How streaming LLM APIs work
I decided to have a poke around and see if I could figure out how the HTTP streaming APIs from the various hosted LLM providers actually worked. Here are my notes so far.
The general pattern
All three of the APIs I investigated worked roughly the same: they return data with a content-type: text/event-stream header, which matches the server-sent events mechanism, then stream blocks separated by \r\n\r\n. Each block has a data: JSON line. Anthropic also include an event: line with an event type.
Annoyingly these can’t be directly consumed using the browser EventSource API because that only works for GET requests, and these APIs all use POST.
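To make that framing concrete, here's a minimal Python sketch that splits an already-buffered response body into blocks and pulls out the event: and data: fields. The function name is my own, and a real client would parse incrementally as chunks arrive - the examples at the end of this post show how to do that:

def parse_sse_blocks(body):
    # Blocks are separated by blank lines; each block holds "event:" and/or "data:" lines
    events = []
    for block in body.split("\r\n\r\n"):
        event = {}
        for line in block.split("\r\n"):
            if line.startswith("event: "):
                event["event"] = line[len("event: "):]
            elif line.startswith("data: "):
                event["data"] = line[len("data: "):]
        if event:
            events.append(event)
    return events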
OpenAI
The following curl incantation runs a prompt through GPT-4o Mini and requests a streaming response. The "stream_options": {"include_usage": true} bit requests that the final message in the stream include details of how many input and output tokens were charged while processing the prompt.
curl https://api.openai.com/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $OPENAI_API_KEY" \
  -d '{
    "model": "gpt-4o-mini",
    "messages": [{"role": "user", "content": "Tell me a joke"}],
    "stream": true,
    "stream_options": {
      "include_usage": true
    }
  }' \
  --no-buffer
That --no-buffer option ensures curl outputs the stream to the console as it arrives. Here’s what I got back, with the middle truncated (see this Gist for the whole thing):
data: {"id":"chatcmpl-A8dyC7f6pKkQ516qqRHK6ep7Z3yG9","object":"chat.completion.chunk","created":1726623632,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_483d39d857","choices":[{"index":0,"delta":{"role":"assistant","content":"","refusal":null},"logprobs":null,"finish_reason":null}],"usage":null}
data: {"id":"chatcmpl-A8dyC7f6pKkQ516qqRHK6ep7Z3yG9","object":"chat.completion.chunk","created":1726623632,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_483d39d857","choices":[{"index":0,"delta":{"content":"Why"},"logprobs":null,"finish_reason":null}],"usage":null}
data: {"id":"chatcmpl-A8dyC7f6pKkQ516qqRHK6ep7Z3yG9","object":"chat.completion.chunk","created":1726623632,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_483d39d857","choices":[{"index":0,"delta":{"content":" did"},"logprobs":null,"finish_reason":null}],"usage":null}
[...]
data: {"id":"chatcmpl-A8dyC7f6pKkQ516qqRHK6ep7Z3yG9","object":"chat.completion.chunk","created":1726623632,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_483d39d857","choices":[{"index":0,"delta":{"content":"!"},"logprobs":null,"finish_reason":null}],"usage":null}
data: {"id":"chatcmpl-A8dyC7f6pKkQ516qqRHK6ep7Z3yG9","object":"chat.completion.chunk","created":1726623632,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_483d39d857","choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}],"usage":null}
data: {"id":"chatcmpl-A8dyC7f6pKkQ516qqRHK6ep7Z3yG9","object":"chat.completion.chunk","created":1726623632,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_483d39d857","choices":[],"usage":{"prompt_tokens":11,"completion_tokens":18,"total_tokens":29,"completion_tokens_details":{"reasoning_tokens":0}}}
data: [DONE]
Those newlines between the chunks are actually \r\n sequences.
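Reconstructing the full response is then a matter of concatenating the delta.content fragments, stopping at [DONE] and picking the usage out of the final chunk (the one with an empty choices list). Here’s a rough sketch, assuming the strings after each data: prefix have already been collected into a list - assemble_openai is my own illustrative name, not part of any library:

import json

def assemble_openai(data_payloads):
    text, usage = "", None
    for payload in data_payloads:
        if payload == "[DONE]":
            break
        chunk = json.loads(payload)
        if chunk["usage"]:
            # The last real chunk has an empty "choices" list and a populated "usage"
            usage = chunk["usage"]
        for choice in chunk["choices"]:
            text += choice["delta"].get("content") or ""
    return text, usage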
Interesting HTTP headers (I used curl -vv to see these):
content-type: text/event-stream; charset=utf-8
access-control-expose-headers: X-Request-ID
openai-organization: user-r...
openai-processing-ms: 81
openai-version: 2020-10-01
strict-transport-security: max-age=15552000; includeSubDomains; preload
x-ratelimit-limit-requests: 30000
x-ratelimit-limit-tokens: 150000000
x-ratelimit-remaining-requests: 29999
x-ratelimit-remaining-tokens: 149999979
x-ratelimit-reset-requests: 2ms
x-ratelimit-reset-tokens: 0s
x-request-id: req_31f3a97f8a5d473aebfa2fa074935618
Anthropic Claude
Here’s the same prompt against Claude 3 Sonnet:
curl https://api.anthropic.com/v1/messages \
  -H "Content-Type: application/json" \
  -H "x-api-key: $ANTHROPIC_API_KEY" \
  -H "anthropic-version: 2023-06-01" \
  -d '{
    "model": "claude-3-sonnet-20240229",
    "messages": [{"role": "user", "content": "Tell me a joke"}],
    "stream": true,
    "max_tokens": 1024
  }' \
  --no-buffer
The max_tokens option is required by the Anthropic API.
I got back this (it’s shorter so I didn’t truncate it):
event: message_start
data: {"type":"message_start","message":{"id":"msg_01SxRKvzSAbPKgXu4781JHjw","type":"message","role":"assistant","model":"claude-3-sonnet-20240229","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":11,"output_tokens":1}} }

event: content_block_start
data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""} }

event: ping
data: {"type": "ping"}

event: content_block_delta
data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"Here"} }

event: content_block_delta
data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"'s a silly"}}

event: content_block_delta
data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" joke for you:"} }

event: content_block_delta
data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"\n\nWhy"} }

event: content_block_delta
data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" can"} }

event: content_block_delta
data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"'t a bicycle"} }

event: content_block_delta
data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" stand up by"} }

event: content_block_delta
data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" itself?"} }

event: content_block_delta
data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"\nBecause it's two"} }

event: content_block_delta
data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"-tired!"} }

event: content_block_stop
data: {"type":"content_block_stop","index":0 }

event: message_delta
data: {"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"output_tokens":30} }

event: message_stop
data: {"type":"message_stop" }
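The typed events make reassembly straightforward: concatenate the text from the content_block_delta events and pick the final token count out of message_delta. A rough sketch, assuming each data: payload has already been parsed into a dict and handling only text_delta deltas (the function name is illustrative):

def assemble_claude(events):
    text, output_tokens = "", None
    for event in events:
        if event["type"] == "content_block_delta":
            # Only handles "text_delta" deltas
            text += event["delta"]["text"]
        elif event["type"] == "message_delta":
            output_tokens = event["usage"]["output_tokens"]
        elif event["type"] == "message_stop":
            break
    return text, output_tokens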
Interesting HTTP headers:
content-type: text/event-stream; charset=utf-8
cache-control: no-cache
anthropic-ratelimit-requests-limit: 4000
anthropic-ratelimit-requests-remaining: 3999
anthropic-ratelimit-requests-reset: 2024-09-21T19:44:06Z
anthropic-ratelimit-tokens-limit: 400000
anthropic-ratelimit-tokens-remaining: 399000
anthropic-ratelimit-tokens-reset: 2024-09-21T19:43:44Z
request-id: req_0189EJVDRQDoLyxjoNqG8Dw7
Google Gemini
Google Gemini returns much larger token chunks, so I had to prompt “Tell me a very long joke” to get back a streaming response that included multiple parts:
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:streamGenerateContent?alt=sse&key=${GOOGLE_API_KEY}" \ -H 'Content-Type: application/json' \ -d '{ "contents": [ { "parts": [ {"text": "Tell me a very long joke"} ] } ] }' \ --no-buffer
I got back this:
data: {"candidates": [{"content": {"parts": [{"text": "A man walks into a library and asks for books about paranoia. The librarian whispers"}],"role": "model"},"finishReason": "STOP","index": 0,"safetyRatings": [{"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_HATE_SPEECH","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_HARASSMENT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_DANGEROUS_CONTENT","probability": "NEGLIGIBLE"}]}],"usageMetadata": {"promptTokenCount": 6,"candidatesTokenCount": 16,"totalTokenCount": 22}}
data: {"candidates": [{"content": {"parts": [{"text": ", \"They're right behind you!\" The man screams and runs out of the library.\nA few days later, he returns and asks for books about"}],"role": "model"},"finishReason": "STOP","index": 0,"safetyRatings": [{"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_HATE_SPEECH","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_HARASSMENT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_DANGEROUS_CONTENT","probability": "NEGLIGIBLE"}]}],"usageMetadata": {"promptTokenCount": 6,"candidatesTokenCount": 48,"totalTokenCount": 54}}
data: {"candidates": [{"content": {"parts": [{"text": " invisibility. The librarian whispers, \"They're right behind you!\" Again, the man screams and runs out.\nThe next day, the man comes back and asks for books about immortality. The librarian whispers, \"They're on the second floor, to the left.\" The man starts to go upstairs,"}],"role": "model"},"finishReason": "STOP","index": 0,"safetyRatings": [{"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_HATE_SPEECH","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_HARASSMENT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_DANGEROUS_CONTENT","probability": "NEGLIGIBLE"}]}],"usageMetadata": {"promptTokenCount": 6,"candidatesTokenCount": 112,"totalTokenCount": 118}}
data: {"candidates": [{"content": {"parts": [{"text": " then turns and whispers to the librarian, \"Are you sure they're there?\" The librarian whispers back, \"I'm not sure. I just saw you go up there.\""}],"role": "model"},"finishReason": "STOP","index": 0,"safetyRatings": [{"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_HATE_SPEECH","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_HARASSMENT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_DANGEROUS_CONTENT","probability": "NEGLIGIBLE"}]}],"usageMetadata": {"promptTokenCount": 6,"candidatesTokenCount": 149,"totalTokenCount": 155}}
HTTP headers:
content-type: text/event-stream
content-disposition: attachment
vary: Origin
vary: X-Origin
vary: Referer
date: Sat, 21 Sep 2024 19:46:22 GMT
server: scaffolding on HTTPServer2
x-xss-protection: 0
x-frame-options: SAMEORIGIN
x-content-type-options: nosniff
server-timing: gfet4t7; dur=911
alt-svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
Bonus: accessing these streams using HTTPX
I like using the HTTPX client library for Python. Here’s how to use that to show the output of a stream to the console, using the handy httpx-sse package:
import os
import json
import asyncio
import httpx
from httpx_sse import aconnect_sse


async def stream_openai_response(prompt):
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY not found in environment variables")

    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "model": "gpt-4-turbo-preview",
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
        "stream_options": {
            "include_usage": True
        }
    }

    async with httpx.AsyncClient() as client:
        async with aconnect_sse(client, "POST", url, json=data, headers=headers) as event_source:
            async for sse in event_source.aiter_sse():
                if sse.event == "error":
                    print(f"Error: {sse.data}")
                elif sse.event == "usage":
                    usage = json.loads(sse.data)
                    print(f"Usage: {usage}")
                else:
                    try:
                        chunk = json.loads(sse.data)
                        if chunk['choices'][0]['finish_reason'] is not None:
                            break
                        content = chunk['choices'][0]['delta'].get('content', '')
                        print(content, end='', flush=True)
                    except json.JSONDecodeError:
                        print(f"Failed to parse JSON: {sse.data}")


async def main():
    prompt = "Tell me a joke"
    await stream_openai_response(prompt)


if __name__ == "__main__":
    asyncio.run(main())
Bonus 2: Processing streaming events in JavaScript with fetch()
With the help of Claude, here’s some JavaScript code (using asynchronous iterators) that can make an API request to this kind of stream and log out the events as they come in:
async function* sseStreamIterator(apiUrl, requestBody, extraHeaders) {
  const response = await fetch(apiUrl, {
    method: 'POST',
    headers: { ...{'Content-Type': 'application/json'}, ...(extraHeaders || {}) },
    body: JSON.stringify(requestBody),
  });

  if (!response.ok) {
    throw new Error(`HTTP error! status: ${response.status}`);
  }

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let buffer = '';

  while (true) {
    const { done, value } = await reader.read();
    if (done) break; // value is always undefined if done is true

    // stream: true ensures multi-byte characters are handled correctly
    buffer += decoder.decode(value, { stream: true });
    const events = buffer.split(/\r?\n\r?\n/);
    buffer = events.pop() || '';

    for (const event of events) {
      const lines = event.split(/\r?\n/);
      const parsedEvent = {};

      for (const line of lines) {
        if (line.startsWith('data: ')) {
          const dataContent = line.slice(6);
          try {
            parsedEvent.data = JSON.parse(dataContent);
          } catch (error) {
            parsedEvent.data = null;
            parsedEvent.data_raw = dataContent;
          }
        } else if (line.includes(': ')) {
          const [key, value] = line.split(': ', 2);
          parsedEvent[key] = value;
        }
      }

      if (Object.keys(parsedEvent).length > 0) {
        yield parsedEvent;
      }
    }
  }
}
async function handleSSE() {
  const apiUrl = 'https://api.openai.com/v1/chat/completions';
  const requestBody = {
    "model": "gpt-4o-mini",
    "messages": [{"role": "user", "content": "Tell me a joke"}],
    "stream": true,
    "stream_options": {
      "include_usage": true
    }
  };
  for await (const event of sseStreamIterator(apiUrl, requestBody, {
    Authorization: 'Bearer sk-...'
  })) {
    console.log(event);
  }
}
handleSSE()