Node.js: Optimisations for parsing a large JSON file
What if you have 14 GB of JSON data and you want to read it in Node.js and run a filtering operation over it? How would you do it?
Type of the data
For the Backend DB architecture project, a friend and I generated 15,930,001 posts, 14 GB of data in total. Each post has the following type:

export class Post {
    public client : number;       // ID of the Client
    public channel : number;      // ID of the Channel
    public id : number;
    public created_date : number;
    public text : string;
    public post_type : "TEXT" | "IMAGE" | "VIDEO" | "LINK";
    public labels : string[];     // max 10
    public insights : number[];   // array of 100 integers
}
Baseline
A good benchmark for the minimum time needed to scan the file is the "wc" utility. Counting the newlines takes 12.1 seconds:
time wc -l ../posts.json
15930000 ../posts.json
wc -l ../posts.json 8.23s user 3.32s system 94% cpu 12.191 total
Filtering for a specific client with "grep" takes 3 minutes 15 seconds. Even though grep is written in C, I would guess that the regular-expression matching takes its toll:
time grep '"client":1206' ../posts.json | wc -l
grep '"client":1206' ../posts.json 191.65s user 3.70s system 99% cpu 3:15.77 total
wc -l 0.01s user 0.02s system 0% cpu 3:15.77 total
Node.js
Doing a plain scan of the 14 GB file with fs.createReadStream clocks in at 13 seconds, which is pretty reasonable and on par with "wc -l".
var fs = require('fs');

var begin = new Date().valueOf();
let size = 0;

function test() {
    let post_stream = fs.createReadStream("../posts.json", { encoding: "utf8" });
    post_stream
        .on("data", post => {
            // console.log("data", post.length)
            size += post.length;
        })
        .on("end", () => {
            console.log(`Finished:
  ${Math.floor(((new Date()).valueOf() - begin) / 1000)} seconds.
  MB: ${size / 1024 / 1024}
`);
        });
}

test();
But then I only have the raw "data" chunks. I need each post as a parsed JSON object, so I can compare its fields and filter. So I used the JSONStream library:
var fs = require('fs')
  , JSONStream = require('JSONStream');

var begin = new Date().valueOf();
let size = 0;

function test() {
    let post_stream = fs.createReadStream("../posts.json", { encoding: "utf8" });
    post_stream
        .pipe(JSONStream.parse('*'))
        .on("data", post => {
            // size += post.length;
        })
        .on("end", () => {
            console.log(`Finished:
  ${Math.floor(((new Date()).valueOf() - begin) / 1000)} seconds.
`);
        });
}

test();
But just by piping through JSONStream, the time jumped from 13 seconds to 1128 seconds (almost 19 minutes), nearly 90× slower! Either this library is optimised for convenience only, disregarding performance, or parsing JSON is inherently slow in Node.js. Let's find out!
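Before recreating anything by hand, one quick sanity check is to time JSON.parse itself on a representative post string held in memory and extrapolate to the 15.93 million posts. This is a standalone sketch, not one of the measurements above, and the sample object merely mimics the Post layout; it separates the cost of parsing from the cost of the streaming machinery around it:

var _sample_post = JSON.stringify({
    client: 1206,
    channel: 39523,
    id: 8,
    created_date: 1516330376000,
    text: "Wumhem uguve olaro nisid rululfup barekfiz jagnu gobum goj ja fajike tedicuk.",
    post_type: "LINK",
    labels: ["fo", "wuvjulaz", "zihubju", "zaokosi", "hosmerab"],
    insights: Array.from({ length: 100 }, (_, i) => i * 1000) // 100 dummy integers
});

var _iterations = 1000000;
var _t0 = Date.now();
for (var i = 0; i < _iterations; i++) {
    JSON.parse(_sample_post); // parse the same in-memory string repeatedly
}
var _seconds = (Date.now() - _t0) / 1000;
console.log(`${_iterations} parses in ${_seconds} seconds`);
// Extrapolate from 1M parses to the full data set of 15,930,000 posts:
console.log(`Projected for 15.93M posts: ~${(_seconds * 15.93).toFixed(0)} seconds`);

If the projected number comes out far below the 18-plus minutes measured above, the overhead sits in the streaming library rather than in JSON.parse itself.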
Let's try to recreate what JSONStream does, using our knowledge of the very specific layout of the posts file. We know that the file is an array of Posts. Each Post is an object delimited by an opening "{" and a closing "}", and since a Post contains no nested objects, the first "}" after a "{" always closes the post (see the Sample post section at the end):
var fs = require('fs');

var _timestamp_start = new Date().valueOf();
let _size = 0;
let _number_posts = 0;
let _post_begin_index = 0;
let _post_end_index = 0;
let _buffer = "";

function test() {
    let post_stream = fs.createReadStream("../posts.json", { encoding: "utf8" });
    post_stream
        .on("data", chunk => {
            _size += chunk.length;
            _buffer += chunk; // append the current chunk
            do {
                _post_begin_index = _buffer.indexOf("{", _post_end_index);
                if (_post_begin_index == -1) {
                    // could not find the beginning of a new post
                    _buffer = ""; // clear up the buffer
                    _post_begin_index = 0;
                    _post_end_index = 0;
                    break;
                }
                _post_end_index = _buffer.indexOf("}", _post_begin_index);
                if (_post_end_index == -1) {
                    // could not find the end of the post
                    _buffer = _buffer.slice(_post_begin_index); // trim the buffer
                    _post_begin_index = 0;
                    _post_end_index = 0;
                    break;
                }
                const post = JSON.parse(
                    _buffer.slice(_post_begin_index, _post_end_index + 1)
                );
                _number_posts++;
                // console.log(post)
            } while (_buffer.length);
        })
        .on("end", () => {
            console.log(`Finished:
  ${Math.floor(((new Date()).valueOf() - _timestamp_start) / 1000)} seconds.
  MB: ${_size / 1024 / 1024}
  Number posts: ${_number_posts}
`);
        });
}

test();
This time the run took 78 seconds, which is "only" about 6× slower than just reading the data file.
String operations in JavaScript copy data, and maybe JSON.parse has a significant per-call overhead? Maybe we can run JSON.parse on bigger chunks of data instead of on single posts. Let's modify the code above to parse multiple posts in one go (i.e. around 65 KB of posts per stream chunk):
var fs = require('fs');

var _timestamp_start = new Date().valueOf();
let _size = 0;
let _number_posts = 0;
let _post_begin_index = 0;
let _post_end_index = 0;
let _buffer = "";

function test() {
    let post_stream = fs.createReadStream("../posts.json", { encoding: "utf8" });
    post_stream
        .on("data", chunk => {
            _size += chunk.length;
            _buffer += chunk; // append the current chunk
            do {
                _post_begin_index = _buffer.indexOf("{", _post_end_index);
                if (_post_begin_index == -1) {
                    // could not find the beginning of a new post
                    _buffer = ""; // clear up the buffer
                    _post_begin_index = 0;
                    _post_end_index = 0;
                    break;
                }
                _post_end_index = _buffer.lastIndexOf("}");
                if (_post_end_index == -1 || _post_end_index < _post_begin_index) {
                    // could not find the end of a post
                    _buffer = _buffer.slice(_post_begin_index); // trim the buffer
                    _post_begin_index = 0;
                    _post_end_index = 0;
                    break;
                }
                // parse all complete posts in the buffer as a single JSON array
                const posts = JSON.parse(
                    "[" +
                    _buffer.slice(_post_begin_index, _post_end_index + 1) +
                    "]"
                );
                _number_posts += posts.length;
                // console.log(posts)
            } while (_buffer.length);
        })
        .on("end", () => {
            console.log(`Finished:
  ${Math.floor(((new Date()).valueOf() - _timestamp_start) / 1000)} seconds.
  MB: ${_size / 1024 / 1024}
  Number posts: ${_number_posts}
`);
        });
}

test();
The time dropped, but only by 10 seconds or so, to 69 seconds. Still, compared to the C utility grep and its 3 minutes 15 seconds, this seems reasonable.
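For completeness, here is a sketch of how the actual filtering (the original goal) could plug into this fastest variant. The timings above measure only the parsing; a simple field comparison per post should not change them much. Client ID 1206 mirrors the earlier grep example and is only illustrative:

var fs = require('fs');

const CLIENT_ID = 1206; // same client as in the grep example above
let _buffer = "";
let _post_begin_index = 0;
let _post_end_index = 0;
let _matches = 0;

fs.createReadStream("../posts.json", { encoding: "utf8" })
    .on("data", chunk => {
        _buffer += chunk; // append the current chunk
        do {
            _post_begin_index = _buffer.indexOf("{", _post_end_index);
            if (_post_begin_index == -1) {
                _buffer = ""; // no further posts in the buffer
                _post_begin_index = 0;
                _post_end_index = 0;
                break;
            }
            _post_end_index = _buffer.lastIndexOf("}");
            if (_post_end_index == -1 || _post_end_index < _post_begin_index) {
                _buffer = _buffer.slice(_post_begin_index); // keep the partial post
                _post_begin_index = 0;
                _post_end_index = 0;
                break;
            }
            const posts = JSON.parse(
                "[" + _buffer.slice(_post_begin_index, _post_end_index + 1) + "]"
            );
            for (const post of posts) {
                if (post.client === CLIENT_ID) {
                    _matches++; // or collect the post for further processing
                }
            }
        } while (_buffer.length);
    })
    .on("end", () => {
        console.log(`Posts for client ${CLIENT_ID}: ${_matches}`);
    });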
Conclusion
I could not find suggestions on how to improve JSON parsing further, so I am giving up on optimising this for now and will assume this is close to the best achievable.
When performance is important, be careful when choosing a library. Benchmark it. Optimise it.
Sample post
All fields of each Post are randomly generated.
{
    "client": 1206,
    "channel": 39523,
    "id": 8,
    "created_date": 1516330376000,
    "text": "Wumhem uguve olaro nisid rululfup barekfiz jagnu gobum goj ja fajike tedicuk.",
    "post_type": "LINK",
    "labels": ["fo", "wuvjulaz", "zihubju", "zaokosi", "hosmerab", "kakdatwuf", "junbi", "zinekaku", "rujsiov"],
    "insights": [
        100005, 657875, 169162, 689672, 988617, 848863, 506110, 770147, 47568, 712479,
        650352, 900524, 973183, 217062, 754424, 711652, 924751, 286921, 363574, 653247,
        126344, 832025, 806129, 976228, 66144, 588530, 108403, 817070, 136224, 2204,
        422520, 383058, 245412, 761675, 585509, 530694, 452252, 592180, 158107, 265906,
        223294, 584236, 36081, 886935, 184333, 811952, 452309, 566509, 997581, 924844,
        667776, 637752, 853350, 153983, 684913, 825527, 65640, 277361, 735523, 153490,
        892049, 469048, 631845, 517898, 93978, 507148, 455821, 955607, 316781, 89193,
        826140, 375440, 760106, 930582, 961828, 298912, 57130, 617704, 159721, 162534,
        99695, 130321, 433671, 278697, 149232, 632954, 245728, 567359, 693463, 159013,
        383417, 593730, 227523, 782458, 997559, 276812, 353369, 685802, 416554, 548174
    ]
}