dotnet api dealing with huge json

Imagine that you need to parse a huge file. The solution is easy: read and process it line by line.

What about HTTP calls?

Let's pretend we have a single endpoint that accepts some JSON. What will happen if the request size is 1 TB? (Assuming it won't time out and no network issues appear.)

The simplest approach here is to do the same as with files: grab the request body stream and just "read it line by line".

But here is the catch: with text files this is easy, but how can it be done with JSON?

// POST /demo: consumes a JSON array of movies of arbitrary size.
// The body is never buffered as a whole: DeserializeAsyncEnumerable yields the
// top-level array elements one at a time while the request stream is being read.
// Responds 400 when the body is not a well-formed JSON array, 200 otherwise.
app.MapPost("/demo", async (HttpRequest request, HttpResponse response) =>
{
    try
    {
        var elements = JsonSerializer.DeserializeAsyncEnumerable<JsonElement>(
            request.Body,
            cancellationToken: request.HttpContext.RequestAborted);

        await foreach (var element in elements)
        {
            try
            {
                var id = element.GetProperty("id").GetInt32();
                var movie = JsonSerializer.Deserialize<Movie>(element.GetRawText());
                Console.WriteLine($"Received movie: {id} - {movie?.Title} {movie?.Year}");
            }
            catch (Exception ex)
            {
                // A bad element is reported and skipped; the rest of the array is still processed.
                Console.WriteLine(ex.Message + ", line: " + element.GetRawText());
            }
        }
    }
    catch (JsonException)
    {
        // Thrown by the streaming reader itself: the root is not an array,
        // or the JSON is malformed or truncated.
        response.StatusCode = StatusCodes.Status400BadRequest;
        return;
    }

    response.StatusCode = StatusCodes.Status200OK;
});

So with this approach we can do something like:

curl -s -X POST localhost:5000/demo -H 'Content-Type: application/json' -d @movies.json
[
{ "id": 1, "title": "The Shawshank Redemption", "year": 1994 },
    {
        "id": 2,
        "title": "The Godfather",
        "year": 1972
    },
    { "title": "Hello World", "body": "mac was here" },
    { "id": 3, "title" : "The Dark Knight", "year": 2008 },
    { "id": "ISBN1", "title": "The Lord of the Rings", "year": 1954 },
    {
        "id": 4,
        "title": "The Godfather: Part II",
        "year": 1974}
]

The beauty of this approach is that we are technically working with usual json, just reading it piece by piece

Newline-delimited JSON

Another possible approach is to use NDJSON as the input. Here is an example of how it may look:

// POST /ndjson: reads newline-delimited JSON (one movie per line) and writes one
// JSON result line to the response for every input line.
app.MapPost("/ndjson", async (HttpRequest request, HttpResponse response, CancellationToken token) =>
{
    response.StatusCode = StatusCodes.Status200OK;
    response.Headers.ContentType = "text/event-stream"; // Chrome displays results as they appear with this type, unlike "application/x-ndjson"
    // Do not set "Transfer-Encoding: chunked" by hand: doing so resulted in empty responses.

    // Start the response before the first input line is processed.
    await response.StartAsync(token);

    using var reader = new StreamReader(request.BodyReader.AsStream());
    while (await reader.ReadLineAsync(token) is { } line)
    {
        try
        {
            var movie = JsonSerializer.Deserialize<Movie>(line);
            Console.WriteLine($"Received movie: {movie?.Id} - {movie?.Title} {movie?.Year}");
            await response.WriteAsync($"{JsonSerializer.Serialize(new { id = movie?.Id, message = "OK" })}\r\n", token);
        }
        catch (JsonException ex)
        {
            // Only parse failures are reported per line; I/O and cancellation
            // errors are left to propagate instead of being retried here.
            Console.WriteLine("JsonException: " + ex.Message + ", Line: " + line);
            await response.WriteAsync($"{JsonSerializer.Serialize(new { id = 0, message = ex.Message, line })}\r\n", token);
        }
    }
    await response.CompleteAsync();
});

Notes:

  • Chrome likes the text/event-stream content type more than application/x-ndjson and will display results as they appear
  • If you try to set the chunked transfer encoding yourself, everything will break and your responses will be empty - something .NET-specific deep inside

And now we can test it

curl -s -X POST localhost:5000/ndjson -H 'Content-Type: application/x-ndjson' -d '{ "id": 1, "title": "The Shawshank Redemption", "year": 1994 }
{"id": 2, "title": "The Godfather", "year": 1972}
{ "title": "Hello World", "body": "mac was here" }
{ "id": 3, "title" : "The Dark Knight", "year": 2008 }
{ "id": "ISBN1", "title": "The Lord of the Rings", "year": 1954 }
{"id": 4, "title": "The Godfather: Part II", "year": 1974}
'

and our response will be something like:

{"id":1,"message":"OK"}
{"id":2,"message":"OK"}
{"id":0,"message":"OK"}
{"id":3,"message":"OK"}
{"id":0,"message":"The JSON value could not be converted to System.Int32. Path: $.id | LineNumber: 0 | BytePositionInLine: 15.","line":"{ \u0022id\u0022: \u0022ISBN1\u0022, \u0022title\u0022: \u0022The Lord of the Rings\u0022, \u0022year\u0022: 1954 }"}
{"id":4,"message":"OK"}

To see the beauty of this, just put some sleep delays in the while loop.

Here is the full example of what I ended up with:

using System.Text.Json;
using System.Text.Json.Serialization;

// Create the application from the command-line arguments; nothing is added to
// the builder before it is built.
WebApplicationBuilder builder = WebApplication.CreateBuilder(args);
WebApplication app = builder.Build();

// GET / returns a fixed greeting string.
app.MapGet("/", static () => "Hello World!");

/*
curl -s -X POST localhost:5000/demo -H 'Content-Type: application/json' -d @movies.json
*/
// POST /demo: consumes a JSON array of movies of arbitrary size.
// The body is never buffered as a whole: DeserializeAsyncEnumerable yields the
// top-level array elements one at a time while the request stream is being read.
// Responds 400 when the body is not a well-formed JSON array, 200 otherwise.
app.MapPost("/demo", async (HttpRequest request, HttpResponse response) =>
{
    try
    {
        var elements = JsonSerializer.DeserializeAsyncEnumerable<JsonElement>(
            request.Body,
            cancellationToken: request.HttpContext.RequestAborted);

        await foreach (var element in elements)
        {
            // Per-element failures are logged and skipped so that one bad
            // movie does not abort the rest of the array.
            try
            {
                var id = element.GetProperty("id").GetInt32();
                var movie = JsonSerializer.Deserialize<Movie>(element.GetRawText());
                Console.WriteLine($"Received movie: {id} - {movie?.Title} {movie?.Year}");
            }
            catch (KeyNotFoundException ex)
            {
                // The element has no "id" property.
                Console.WriteLine("KeyNotFoundException: " + ex.Message);
                Console.WriteLine(element.GetRawText());
            }
            catch (InvalidOperationException ex)
            {
                // The element is not an object, or "id" is not a number.
                Console.WriteLine("InvalidOperationException: " + ex.Message);
                Console.WriteLine(element.GetRawText());
            }
            catch (JsonException ex)
            {
                // The element could not be converted to a Movie.
                Console.WriteLine("JsonException: " + ex.Message);
                Console.WriteLine(element.GetRawText());
            }
        }
    }
    catch (JsonException)
    {
        // Thrown by the streaming reader itself: the root is not an array,
        // or the JSON is malformed or truncated.
        response.StatusCode = StatusCodes.Status400BadRequest;
        return;
    }

    response.StatusCode = StatusCodes.Status200OK;
});


// POST /second: reads a JSON array of movies and validates every element with
// Try*-style checks instead of exceptions. The array is streamed element by
// element rather than buffered. Responds 400 when the body is not a
// well-formed JSON array.
app.MapPost("/second", async (HttpRequest request) =>
{
    try
    {
        var elements = JsonSerializer.DeserializeAsyncEnumerable<JsonElement>(
            request.Body,
            cancellationToken: request.HttpContext.RequestAborted);

        await foreach (var element in elements)
        {
            // TryGetProperty throws on non-objects, and TryGetInt32/GetString throw on
            // the wrong value kind, so each kind is checked before the value is read.
            if (element.ValueKind == JsonValueKind.Object &&
                element.TryGetProperty("id", out var idProperty) &&
                idProperty.ValueKind == JsonValueKind.Number &&
                idProperty.TryGetInt32(out var id) &&
                element.TryGetProperty("title", out var titleProperty) &&
                (titleProperty.ValueKind is JsonValueKind.String or JsonValueKind.Null) &&
                element.TryGetProperty("year", out var yearProperty) &&
                yearProperty.ValueKind == JsonValueKind.Number &&
                yearProperty.TryGetInt32(out var year))
            {
                var title = titleProperty.GetString();

                Console.WriteLine($"Received movie: {id} - {title} {year}");
            }
            else
            {
                Console.WriteLine("Error processing movie");
            }
        }
    }
    catch (JsonException)
    {
        // The root is not an array, or the JSON is malformed or truncated.
        request.HttpContext.Response.StatusCode = StatusCodes.Status400BadRequest;
    }
});

/*
curl -s -X POST localhost:5000/ndjson -H 'Content-Type: application/x-ndjson' -d '{ "id": 1, "title": "The Shawshank Redemption", "year": 1994 }
{"id": 2, "title": "The Godfather", "year": 1972}
{ "title": "Hello World", "body": "mac was here" }
{ "id": 3, "title" : "The Dark Knight", "year": 2008 }
{ "id": "ISBN1", "title": "The Lord of the Rings", "year": 1954 }
{"id": 4, "title": "The Godfather: Part II", "year": 1974}
'
*/
// POST /ndjson: reads newline-delimited JSON (one movie per line) and writes one
// JSON result line to the response for every input line.
app.MapPost("/ndjson", async (HttpRequest request, HttpResponse response, CancellationToken token) =>
{
    response.StatusCode = StatusCodes.Status200OK;
    response.Headers.ContentType = "text/event-stream"; // Chrome displays results as they appear with this type, unlike "application/x-ndjson"
    // Do not set "Transfer-Encoding: chunked" by hand: doing so resulted in empty responses.

    // Start the response before the first input line is processed.
    await response.StartAsync(token);

    using var reader = new StreamReader(request.BodyReader.AsStream());
    while (await reader.ReadLineAsync(token) is { } line)
    {
        try
        {
            var movie = JsonSerializer.Deserialize<Movie>(line);
            Console.WriteLine($"Received movie: {movie?.Id} - {movie?.Title} {movie?.Year}");
            await response.WriteAsync($"{JsonSerializer.Serialize(new { id = movie?.Id, message = "OK" })}\r\n", token);
        }
        catch (JsonException ex)
        {
            // The line is not valid JSON for a Movie: report it to the client and keep going.
            Console.WriteLine("JsonException: " + ex.Message + ", Line: " + line);
            await response.WriteAsync($"{JsonSerializer.Serialize(new { id = 0, message = ex.Message, line })}\r\n", token);
        }
    }
    await response.CompleteAsync();
});

// Start the application; all endpoints are mapped above this line.
app.Run();

/// <summary>
/// A movie as it appears in the request JSON. Each property is bound to a
/// lower-case JSON property name via <see cref="JsonPropertyNameAttribute"/>.
/// </summary>
public record Movie
{
    /// <summary>Numeric identifier, mapped to the <c>id</c> JSON property.</summary>
    [JsonPropertyName("id")] public int Id { get; init; }

    /// <summary>
    /// Title, mapped to the <c>title</c> JSON property. Declared <c>required</c>,
    /// so object initializers must set it.
    /// </summary>
    [JsonPropertyName("title")] public required string Title { get; init; }

    /// <summary>Release year, mapped to the <c>year</c> JSON property.</summary>
    [JsonPropertyName("year")] public int Year { get; init; }
}