add common-terms
1
content/en/posts/llama-cpp/common-terms/content.svg
Normal file
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 455 60" width="455" height="60"><path fill="none" d="M0 0h455v60H0z"/><rect x="10" y="10" width="60" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="40" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">你好</text><rect x="75" y="10" width="200" height="40" rx="10" ry="10" fill="#a5d8ff" stroke="#000" stroke-width="1.5"/><text x="175" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">我今天能怎么帮助你</text><rect x="280" y="10" width="120" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="340" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">1+1等于几</text><rect x="405" y="10" width="40" height="40" rx="10" ry="10" fill="#a5d8ff" stroke="#000" stroke-width="1.5"/><text x="425" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">2</text></svg>
1
content/en/posts/llama-cpp/common-terms/hello-hello-1plus1-equal2.svg
Normal file
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 666.5 60" width="666.5" height="60"><path fill="none" d="M0 0h666.5v60H0z"/><rect x="10" y="10" width="69.0" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="44.5" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">Hello!</text><rect x="84.0" y="10" width="316.0" height="40" rx="10" ry="10" fill="#a5d8ff" stroke="#000" stroke-width="1.5"/><text x="242.0" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">Hello! How can I help you today?</text><rect x="405.0" y="10" width="118.5" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="464.25" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">1+1等于几</text><rect x="528.5" y="10" width="128.0" height="40" rx="10" ry="10" fill="#a5d8ff" stroke="#000" stroke-width="1.5"/><text x="592.5" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">1+1等于2。</text></svg>
1
content/en/posts/llama-cpp/common-terms/hello-hello-1plus1.svg
Normal file
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 533.5 60" width="533.5" height="60"><path fill="none" d="M0 0h533.5v60H0z"/><rect x="10" y="10" width="69.0" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="44.5" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">Hello!</text><rect x="84.0" y="10" width="316.0" height="40" rx="10" ry="10" fill="#a5d8ff" stroke="#000" stroke-width="1.5"/><text x="242.0" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">Hello! How can I help you today?</text><rect x="405.0" y="10" width="118.5" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="464.25" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">1+1等于几</text></svg>
1
content/en/posts/llama-cpp/common-terms/hello-hello.svg
Normal file
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 410.0 60" width="410.0" height="60"><path fill="none" d="M0 0h410.0v60H0z"/><rect x="10" y="10" width="69.0" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="44.5" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">Hello!</text><rect x="84.0" y="10" width="316.0" height="40" rx="10" ry="10" fill="#a5d8ff" stroke="#000" stroke-width="1.5"/><text x="242.0" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">Hello! How can I help you today?</text></svg>
1
content/en/posts/llama-cpp/common-terms/hello.svg
Normal file
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 89.0 60" width="89.0" height="60"><path fill="none" d="M0 0h89.0v60H0z"/><rect x="10" y="10" width="69.0" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="44.5" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">Hello!</text></svg>
664
content/en/posts/llama-cpp/common-terms/index.md
Normal file
@@ -0,0 +1,664 @@
---
title: Common Terms, Concepts and Explanations of Large Language Models
subtitle:
date: 2025-04-15T17:22:42-04:00
lastmod: 2025-04-15T17:22:42-04:00
slug: common-terms
draft: false
author:
  name: James
  link: https://www.jamesflare.com
  email:
  avatar: /site-logo.avif
description: This blog post explains common terms and concepts related to large language models (LLMs), including context windows, temperature, and various model types like multi-modal and reasoning models. It also covers practical applications and API usage examples.
keywords: ["large language model", "LLM", "AI model", "AI terminology", "AI concept", "LLM terminology", "context window", "temperature", "multimodal model", "inference model"]
license:
comment: true
weight: 0
tags:
  - LLM
  - llama.cpp
  - Ollama
categories:
  - LLM
collections:
  - LLM
hiddenFromHomePage: false
hiddenFromSearch: false
hiddenFromRss: false
hiddenFromRelated: false
summary: This blog post explains common terms and concepts related to large language models (LLMs), including context windows, temperature, and various model types like multi-modal and reasoning models. It also covers practical applications and API usage examples.
resources:
  - name: featured-image
    src: featured-image.jpg
  - name: featured-image-preview
    src: featured-image-preview.jpg
toc: true
math: false
lightgallery: false
password:
message:
repost:
  enable: false
  url:

# See details front matter: https://fixit.lruihao.cn/documentation/content-management/introduction/#front-matter
---

<!--more-->

## How Does the Model Generate Responses?

Mainstream models today use autoregressive generation: they produce subsequent text conditioned on the preceding context. The model itself has no memory and isn't truly holding a conversation with you. Instead, it works like this:

{{< image src="content.svg" caption="Context Window" >}}

Each turn of the conversation adds to the context length until it exceeds an upper limit. When this happens, content is either discarded or compressed and summarized. This limit is called the context window.

In some cases, a context window that isn't large enough simply can't handle certain content effectively. Google's Gemini 2.5 Pro supports a context length of 1 million tokens, OpenAI's newly released GPT-4.1 also supports a maximum context length of 1 million tokens, and o3 has a context length of 200,000 tokens.

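If you're curious how quickly a conversation fills the window, you can count tokens yourself. Here's a minimal sketch using the `tiktoken` library; tokenizers differ between models, so treat the numbers as an approximation:

```python
# Approximate how much of the context window a conversation occupies.
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

conversation = [
    "Hello!",
    "Hello! How can I help you today?",
    "1+1等于几",
    "2",
]

total = sum(len(enc.encode(message)) for message in conversation)
print(f"Tokens used so far: {total}")  # grows with every turn
```
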
### Temperature

Temperature is a parameter in LLMs that affects the randomness and creativity of the generated text. A lower temperature, such as 0.2, makes the output more focused and predictable, suitable for tasks requiring accuracy, like technical writing. A higher temperature, such as 0.7, makes the output more diverse and creative, suitable for storytelling or brainstorming.

IBM has written a good article: [What is LLM Temperature?](https://www.ibm.com/think/topics/llm-temperature)

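Mechanically, temperature simply divides the model's logits before the softmax, sharpening or flattening the distribution. A small illustrative sketch, not any particular model's actual implementation:

```python
import numpy as np

def sample_with_temperature(logits: np.ndarray, temperature: float) -> int:
    """Sample a token id from temperature-scaled logits."""
    scaled = logits / temperature           # T < 1 sharpens, T > 1 flattens
    probs = np.exp(scaled - scaled.max())   # numerically stable softmax
    probs /= probs.sum()
    return int(np.random.choice(len(probs), p=probs))

logits = np.array([2.0, 1.0, 0.5, -1.0])
print(sample_with_temperature(logits, 0.2))  # almost always picks token 0
print(sample_with_temperature(logits, 1.5))  # noticeably more varied
```
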
### Top-K and Top-P

Top-K restricts the selection of the next Token to the top K most probable options from the model's output. For example, if K is set to 5, the model will only consider the 5 most probable Tokens, making the generated text more coherent and predictable. This helps reduce randomness, but setting K too low may limit creativity.

Top-P (also known as nucleus sampling) works by selecting the smallest set of Tokens whose cumulative probability exceeds a specified threshold (e.g., 0.9). This includes all Tokens that collectively account for 90% of the probability mass, allowing for more varied output while avoiding very-low-probability Tokens that might be meaningless. It is generally considered more flexible than Top-K.

There's a well-written article: [How to generate text: using different decoding methods for language generation with Transformers](https://huggingface.co/blog/how-to-generate)

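To make the two filters concrete, here's a rough sketch of how a sampler might apply them to a probability distribution; real implementations operate on logits and handle more edge cases:

```python
import numpy as np

def top_k_filter(probs: np.ndarray, k: int) -> np.ndarray:
    """Keep only the k most probable tokens, then renormalize."""
    cutoff = np.sort(probs)[-k]
    filtered = np.where(probs >= cutoff, probs, 0.0)
    return filtered / filtered.sum()

def top_p_filter(probs: np.ndarray, p: float) -> np.ndarray:
    """Keep the smallest set of tokens whose cumulative probability reaches p."""
    order = np.argsort(probs)[::-1]              # most probable first
    cumulative = np.cumsum(probs[order])
    keep = order[: int(np.searchsorted(cumulative, p)) + 1]
    filtered = np.zeros_like(probs)
    filtered[keep] = probs[keep]
    return filtered / filtered.sum()

probs = np.array([0.5, 0.2, 0.15, 0.1, 0.05])
print(top_k_filter(probs, 2))    # only the top 2 tokens survive
print(top_p_filter(probs, 0.9))  # the tokens covering 90% of the mass survive
```
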
### Small Practice

If you haven't fully grasped the concept of context, take a look at this request first:

{{< image src="hello.svg" caption="Hello!" >}}

```bash
|
||||
curl https://api.openai.com/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $OPENAI_API_KEY" \
|
||||
-d '{
|
||||
"model": "gpt-4.1",
|
||||
"messages": [
|
||||
{
|
||||
"role": "developer",
|
||||
"content": "You are a helpful assistant."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello!"
|
||||
}
|
||||
],
|
||||
"temperature": 0.7
|
||||
}'
|
||||
```
|
||||
|
||||
It sends the message "Hello!" to the model `gpt-4.1`. Its system prompt is "You are a helpful assistant." and its `temperature` is `0.7`.

> [!Tip]
> Here, `developer` is a new role. It replaces the earlier `system` role in o1 and later models.

You should receive a response like this:

{{< image src="hello-hello.svg" caption="Hello! How can I help you today?" >}}

```json
|
||||
{
|
||||
"id": "chatcmpl-BPctVve9zkK7PuUsFvpWbsWzWdR0b",
|
||||
"object": "chat.completion",
|
||||
"created": 1745447409,
|
||||
"model": "gpt-4.1-2025-04-14",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "Hello! How can I help you today?",
|
||||
"refusal": null,
|
||||
"annotations": []
|
||||
},
|
||||
"logprobs": null,
|
||||
"finish_reason": "stop"
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 19,
|
||||
"completion_tokens": 10,
|
||||
"total_tokens": 29,
|
||||
"prompt_tokens_details": {
|
||||
"cached_tokens": 0,
|
||||
"audio_tokens": 0
|
||||
},
|
||||
"completion_tokens_details": {
|
||||
"reasoning_tokens": 0,
|
||||
"audio_tokens": 0,
|
||||
"accepted_prediction_tokens": 0,
|
||||
"rejected_prediction_tokens": 0
|
||||
}
|
||||
},
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_b38e740b47"
|
||||
}
|
||||
```
|
||||
|
||||
A multi-turn conversation would be initiated like this:

{{< image src="hello-hello-1plus1.svg" caption="1+1 equals what?" >}}

```bash
|
||||
curl https://api.openai.com/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $OPENAI_API_KEY" \
|
||||
-d '{
|
||||
"model": "gpt-4.1",
|
||||
"messages": [
|
||||
{
|
||||
"role": "developer",
|
||||
"content": "You are a helpful assistant."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello!"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Hello! How can I help you today?"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "1+1 equals what?"
|
||||
}
|
||||
],
|
||||
"temperature": 0.7
|
||||
}'
|
||||
```
|
||||
|
||||
And you would receive a response like this:

{{< image src="hello-hello-1plus1-equal2.svg" caption="1+1 equals 2." >}}

```json
|
||||
{
|
||||
"id": "chatcmpl-BPcv4alGyqzbbwN9ZBzXVXUcX7CWS",
|
||||
"object": "chat.completion",
|
||||
"created": 1745447506,
|
||||
"model": "gpt-4.1-2025-04-14",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "1+1 equals 2.",
|
||||
"refusal": null,
|
||||
"annotations": []
|
||||
},
|
||||
"logprobs": null,
|
||||
"finish_reason": "stop"
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 42,
|
||||
"completion_tokens": 8,
|
||||
"total_tokens": 50,
|
||||
"prompt_tokens_details": {
|
||||
"cached_tokens": 0,
|
||||
"audio_tokens": 0
|
||||
},
|
||||
"completion_tokens_details": {
|
||||
"reasoning_tokens": 0,
|
||||
"audio_tokens": 0,
|
||||
"accepted_prediction_tokens": 0,
|
||||
"rejected_prediction_tokens": 0
|
||||
}
|
||||
},
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a1102cf978"
|
||||
}
|
||||
```
|
||||
|
||||
This is essentially how the API works at a basic level, and you should now have a deeper understanding of the conversation format.

### Common Questions

> **How many rounds of conversation can I have?**
>
> Many user-facing products limit the number of conversation rounds, with different tiers having different quotas. If you've understood the content above, you'll realize that the API charges per request, which is directly related to context length but not to the number of conversation rounds. Therefore, to maximize the value of your quota, you should construct prompts carefully so that tasks finish within a few rounds. Additionally, even for the same model, different tier packages may have different context window sizes.

> **Does this model support file uploads?**
>
> Many users upload files like PPTs and PDFs to the model. Keep in mind, however, that models may not be able to read these files natively. Most models can only handle text input and output; models that can handle multiple input/output formats are called multimodal models. Mainstream models today support image input, making them multimodal, and some frontier models support voice and video input/output. You may have noticed that none of them support PPT or PDF files directly.
> In fact, file uploads are implemented independently by each provider. In essence, the file is parsed into text content, possibly along with images. If you're unsure how this works, take a look at [Mathpix](https://mathpix.com/), which provides PDF parsing services to multiple companies. It parses PDFs into Markdown-like text that can be included in LLM requests for document understanding and QA. So, if you want to be thorough, you can manually paste the text content of a PDF into the request; the results won't be worse, and may even be better.

> **Why can the model still hold a conversation when the text length clearly exceeds the window limit?**
>
> This is another optimization technique. Early on, GPT-3.5 had a context window of only 4096 tokens, which filled up quickly. At that point, the model would be asked to summarize the preceding context to compress its length. Beyond this trick, there are more advanced techniques such as retrieval-augmented generation (RAG). Performance is definitely inferior to fitting everything in the native window, though, and model performance also degrades as the number of rounds grows. This was very noticeable in early LLMs, but after years of industry improvements, multi-round conversation performance has improved greatly.

## Detailed Model Types

As models have continued to develop, many new technologies and features have been introduced. What are their differences, and what do they represent?

### Multimodal Models

If a model can only take text as input and produce text as output, it's a unimodal model; otherwise, it's a multimodal model. For example, GPT-4 initially supported text and image input with text output, introducing image understanding to LLMs and making it a multimodal model.

Now we want models to handle more modalities, such as voice, video, and image input, as well as voice output and image generation. These are becoming increasingly common in recently released models.

### Reasoning Models

It was discovered early on that adding phrases like "please think step by step" to a request could significantly improve the accuracy of LLM responses.

OpenAI's o1 was the first reasoning model to see truly large-scale commercial use. What sets it apart from other models is that it performs CoT (Chain-of-Thought) reasoning before generating a response. The performance improvement is particularly significant in STEM fields like mathematics and on complex tasks. OpenAI wrote an article about the performance gains: [Learning to reason with LLMs](https://openai.com/index/learning-to-reason-with-llms/).

You can see how large the improvement is compared to general models; it ushered in a new era.

{{< echarts >}} {
|
||||
"tooltip": {
|
||||
"trigger": "axis",
|
||||
"axisPointer": {
|
||||
"type": "shadow"
|
||||
}
|
||||
},
|
||||
"legend": {
|
||||
"top": 30,
|
||||
"data": ["gpt4o", "o1 preview", "o1", "expert human"]
|
||||
},
|
||||
"grid": {
|
||||
"left": "8%",
|
||||
"right": "8%",
|
||||
"bottom": "10%",
|
||||
"containLabel": true
|
||||
},
|
||||
"xAxis": {
|
||||
"type": "category",
|
||||
"data": [
|
||||
"AIME 2024\n(Competition Math)",
|
||||
"Codeforces\n(Competition Code)",
|
||||
"GPQA Diamond\n(PhD-Level Science)"
|
||||
],
|
||||
"axisLabel": {
|
||||
"interval": 0
|
||||
}
|
||||
},
|
||||
"yAxis": {
|
||||
"type": "value",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"name": "Accuracy / Percentile (%)",
|
||||
"nameGap": 32,
|
||||
"nameLocation": "center"
|
||||
},
|
||||
"series": [
|
||||
{
|
||||
"name": "gpt4o",
|
||||
"type": "bar",
|
||||
"data": [13.4, 11.0, 56.1],
|
||||
"barGap": "0",
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "o1 preview",
|
||||
"type": "bar",
|
||||
"data": [56.7, 62.0, 78.3],
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "o1",
|
||||
"type": "bar",
|
||||
"data": [83.3, 89.0, 78.0],
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "expert human",
|
||||
"type": "bar",
|
||||
"data": [null, null, 69.7],
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
{{< /echarts >}}
|
||||
|
||||
DeepSeek R1 was the first open-source reasoning model that could rival o1's performance. It too ushered in a new era: similar training methods were applied to many models and inspired new research, and the open-source community finally had an o1-level model.

### Non-reasoning Models

After reasoning models were introduced, models that don't reason before generating a response were categorized as non-reasoning models. Their performance in STEM fields is significantly worse than that of reasoning models, but they offer good cost-effectiveness, because CoT reasoning can be lengthy and therefore expensive. Additionally, reasoning models don't show a significant advantage over non-reasoning models on literary tasks, and waiting for the model to reason isn't acceptable in every situation.

### Hybrid Models

Is there a way to combine the performance of reasoning models with the cost-effectiveness of non-reasoning models? Anthropic's answer is Claude 3.7 Sonnet, the first hybrid model. It can switch between reasoning and non-reasoning modes and lets you specify a budget of CoT Tokens, enabling better cost control.

{{< echarts >}} {
|
||||
"tooltip": {
|
||||
"trigger": "axis",
|
||||
"axisPointer": {
|
||||
"type": "shadow"
|
||||
}
|
||||
},
|
||||
"legend": {
|
||||
"top": 30,
|
||||
"data": [
|
||||
"Claude 3.7 Sonnet (ext)",
|
||||
"Claude 3.7 Sonnet",
|
||||
"OpenAI o1",
|
||||
"DeepSeek R1"
|
||||
]
|
||||
},
|
||||
"grid": {
|
||||
"left": "8%",
|
||||
"right": "8%",
|
||||
"bottom": "10%",
|
||||
"containLabel": true
|
||||
},
|
||||
"xAxis": {
|
||||
"type": "category",
|
||||
"data": [
|
||||
"GPQA\nDiamond",
|
||||
"SWE-bench\nVerified",
|
||||
"MMALU",
|
||||
"IFEval",
|
||||
"MATH 500",
|
||||
"AIME 2024"
|
||||
],
|
||||
"axisLabel": {
|
||||
"interval": 0
|
||||
}
|
||||
},
|
||||
"yAxis": {
|
||||
"type": "value",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"name": "Accuracy / Percentile (%)",
|
||||
"nameGap": 32,
|
||||
"nameLocation": "center"
|
||||
},
|
||||
"series": [
|
||||
{
|
||||
"name": "Claude 3.7 Sonnet (ext)",
|
||||
"type": "bar",
|
||||
"data": [84.8, null, 86.1, 93.2, 96.2, 80.0],
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top",
|
||||
"fontSize": 10
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Claude 3.7 Sonnet",
|
||||
"type": "bar",
|
||||
"data": [68.0, 70.3, 83.2, 90.8, 82.2, 23.3],
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top",
|
||||
"fontSize": 10
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "OpenAI o1",
|
||||
"type": "bar",
|
||||
"data": [78.0, 48.9, 87.7, null, 96.4, 83.3],
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top",
|
||||
"fontSize": 10
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "DeepSeek R1",
|
||||
"type": "bar",
|
||||
"data": [71.5, 49.2, 79.5, 83.3, 97.3, 79.8],
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top",
|
||||
"fontSize": 10
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
{{< /echarts >}}
|
||||
|
||||
Later, some open-source models adopted this concept, such as Cogito v1 Preview, which enables reasoning mode when "Enable deep thinking subroutine." is added to the beginning of the System Prompt.

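For instance, if you were serving such a model behind an OpenAI-compatible endpoint (the local URL and model name below are illustrative assumptions), toggling reasoning mode is just a matter of swapping the system prompt:

```python
# Hypothetical sketch: toggling Cogito-style reasoning mode via the
# system prompt on an OpenAI-compatible local server (URL/model name assumed).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:11434/v1", api_key="none")

def ask(question: str, deep_thinking: bool) -> str:
    system = "Enable deep thinking subroutine." if deep_thinking else ""
    response = client.chat.completions.create(
        model="cogito:8b",  # illustrative model name
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": question},
        ],
    )
    return response.choices[0].message.content

print(ask("1+1等于几", deep_thinking=True))
```
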
## Model Features

LLMs are introducing more and more new features, some of which are powerful and trendy.

### Function Calling

We want to expand the model's capabilities, such as the ability to call external tools. Here's a sample request:

```bash
|
||||
curl https://api.openai.com/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $OPENAI_API_KEY" \
|
||||
-d '{
|
||||
"model": "gpt-4.1",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Can you send an email to ilan@example.com and katia@example.com saying hi?"
|
||||
}
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "send_email",
|
||||
"description": "Send an email to a given recipient with a subject and message.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"to": {
|
||||
"type": "string",
|
||||
"description": "The recipient email address."
|
||||
},
|
||||
"subject": {
|
||||
"type": "string",
|
||||
"description": "Email subject line."
|
||||
},
|
||||
"body": {
|
||||
"type": "string",
|
||||
"description": "Body of the email message."
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"to",
|
||||
"subject",
|
||||
"body"
|
||||
],
|
||||
"additionalProperties": false
|
||||
},
|
||||
"strict": true
|
||||
}
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
You would receive a model response containing a call to the previously defined tool:

```json
|
||||
{
|
||||
"id": "chatcmpl-BPy4wVm9NJBqduSwkHF3FKtyvBRF0",
|
||||
"object": "chat.completion",
|
||||
"created": 1745528842,
|
||||
"model": "gpt-4.1-2025-04-14",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": null,
|
||||
"tool_calls": [
|
||||
{
|
||||
"id": "call_RLgC1PNJou9nepd5zB6rh3Yj",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "send_email",
|
||||
"arguments": "{\"to\": \"ilan@example.com\", \"subject\": \"Hello!\", \"body\": \"Hi Ilan,\"}"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "call_zum2F3ZHdlv0ECZQ1Tur4zyZ",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "send_email",
|
||||
"arguments": "{\"to\": \"katia@example.com\", \"subject\": \"Hello!\", \"body\": \"Hi Katia,\"}"
|
||||
}
|
||||
}
|
||||
],
|
||||
"refusal": null,
|
||||
"annotations": []
|
||||
},
|
||||
"logprobs": null,
|
||||
"finish_reason": "tool_calls"
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 91,
|
||||
"completion_tokens": 74,
|
||||
"total_tokens": 165,
|
||||
"prompt_tokens_details": {
|
||||
"cached_tokens": 0,
|
||||
"audio_tokens": 0
|
||||
},
|
||||
"completion_tokens_details": {
|
||||
"reasoning_tokens": 0,
|
||||
"audio_tokens": 0,
|
||||
"accepted_prediction_tokens": 0,
|
||||
"rejected_prediction_tokens": 0
|
||||
}
|
||||
},
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a1102cf978"
|
||||
}
|
||||
```
|
||||
|
||||
However, the process isn't complete at this point, because the model doesn't execute the tools it calls. You need to execute the tool calls yourself and return the results to the model. The complete flow looks like this:

```mermaid
|
||||
sequenceDiagram
|
||||
participant Dev as Developer
|
||||
participant LLM as Model
|
||||
|
||||
Note over Dev,LLM: 1: Tool Definitions + Messages
|
||||
Dev->>LLM: get_weather(location)<br>What's the weather in Paris?
|
||||
|
||||
Note over Dev,LLM: 2: Tool Calls
|
||||
LLM-->>Dev: get_weather("paris")
|
||||
|
||||
Note over Dev: 3: Execute Function Code
|
||||
Dev->>Dev: get_weather("paris")<br>{"temperature": 14}
|
||||
|
||||
Note over Dev,LLM: 4: Results
|
||||
Dev->>LLM: All Prior Messages<br>{"temperature": 14}
|
||||
|
||||
Note over Dev,LLM: 5: Final Response
|
||||
LLM-->>Dev: It's currently 14°C in Paris.
|
||||
```
|
||||
|
||||
For more details, you can check OpenAI's example: [Function calling](https://platform.openai.com/docs/guides/function-calling?api-mode=chat&lang=curl&example=get-weather&strict-mode=enabled).

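As a rough sketch, steps 2 through 4 of the diagram look like this in code (using the OpenAI Python client; `send_email` is a stand-in for your own implementation):

```python
# Minimal sketch of the tool-call round trip with the OpenAI Python client.
import json
from openai import OpenAI

client = OpenAI()

def send_email(to: str, subject: str, body: str) -> str:
    """Stand-in for your real email-sending implementation."""
    return f"email to {to} queued"

tools = [{
    "type": "function",
    "function": {
        "name": "send_email",
        "description": "Send an email to a given recipient with a subject and message.",
        "parameters": {
            "type": "object",
            "properties": {
                "to": {"type": "string"},
                "subject": {"type": "string"},
                "body": {"type": "string"},
            },
            "required": ["to", "subject", "body"],
        },
    },
}]

messages = [{"role": "user", "content": "Can you send an email to ilan@example.com saying hi?"}]

# Step 2: the model answers with tool calls instead of text.
response = client.chat.completions.create(model="gpt-4.1", messages=messages, tools=tools)
message = response.choices[0].message
messages.append(message)  # keep the assistant's tool calls in the context

# Step 3: execute each tool call and append its result.
for call in message.tool_calls or []:
    args = json.loads(call.function.arguments)
    result = send_email(**args)
    messages.append({"role": "tool", "tool_call_id": call.id, "content": result})

# Steps 4-5: send everything back; the model now writes the final response.
final = client.chat.completions.create(model="gpt-4.1", messages=messages, tools=tools)
print(final.choices[0].message.content)
```
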
### Structured Outputs

This is exactly what it sounds like: the model can be made to return responses in a predefined JSON format. You can check OpenAI's example: [Structured Outputs](https://platform.openai.com/docs/guides/structured-outputs?api-mode=chat&lang=python).

Here's an example using Pydantic from [SGLang's documentation](https://docs.sglang.ai/backend/structured_outputs.html#JSON):

```python
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
# Define the schema using Pydantic
|
||||
class CapitalInfo(BaseModel):
|
||||
name: str = Field(..., pattern=r"^\w+$", description="Name of the capital city")
|
||||
population: int = Field(..., description="Population of the capital city")
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Please generate the information of the capital of France in the JSON format.",
|
||||
},
|
||||
],
|
||||
temperature=0,
|
||||
max_tokens=128,
|
||||
response_format={
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "foo",
|
||||
# convert the pydantic model to json schema
|
||||
"schema": CapitalInfo.model_json_schema(),
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
response_content = response.choices[0].message.content
|
||||
# validate the JSON response by the pydantic model
|
||||
capital_info = CapitalInfo.model_validate_json(response_content)
|
||||
print_highlight(f"Validated response: {capital_info.model_dump_json()}")
|
||||
```
|
||||
|
||||
The response might look like this:

```console
Validated response: {"name":"Paris","population":2147000}
```

You can also use JSON directly:

```python
|
||||
import json
|
||||
|
||||
json_schema = json.dumps(
|
||||
{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string", "pattern": "^[\\w]+$"},
|
||||
"population": {"type": "integer"},
|
||||
},
|
||||
"required": ["name", "population"],
|
||||
}
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Give me the information of the capital of France in the JSON format.",
|
||||
},
|
||||
],
|
||||
temperature=0,
|
||||
max_tokens=128,
|
||||
response_format={
|
||||
"type": "json_schema",
|
||||
"json_schema": {"name": "foo", "schema": json.loads(json_schema)},
|
||||
},
|
||||
)
|
||||
|
||||
print_highlight(response.choices[0].message.content)
|
||||
```
|
||||
|
||||
The response might look like this:

```console
{"name": "Paris", "population": 2147000}
```

Of course, Structured Outputs are not limited to JSON, but this is just a starting point.

164
content/en/posts/llama-cpp/common-terms/svg_generator.py
Normal file
@@ -0,0 +1,164 @@
import sys
import html
import unicodedata
from typing import List, Tuple


def is_cjk(char):
    """
    Check whether a character is CJK (Chinese/Japanese/Korean) text.

    East Asian Width categories:
    - W: wide characters (e.g., Han ideographs)
    - F: fullwidth characters
    - A: ambiguous characters (some CJK punctuation)
    """
    east_asian_width = unicodedata.east_asian_width(char)
    return east_asian_width in ('W', 'F', 'A')


def is_cjk_punct(char):
    """Check whether a character is CJK punctuation."""
    # Common Unicode ranges for CJK punctuation
    cjk_punct_ranges = [
        (0x3000, 0x303F),  # CJK symbols and punctuation
        (0xFF00, 0xFFEF),  # fullwidth ASCII and punctuation
    ]

    code = ord(char)
    return any(start <= code <= end for start, end in cjk_punct_ranges)


def calculate_width(text: str) -> float:
    """
    Calculate the rectangle width for a piece of text.

    Args:
        text: the text to measure

    Returns:
        width in pixels
    """
    if len(text) <= 1:
        return 40

    # Count CJK characters and CJK punctuation
    cjk_count = sum(1 for char in text if is_cjk(char) or is_cjk_punct(char))
    latin_count = len(text) - cjk_count

    # Approximate the width based on sample measurements
    width = cjk_count * 26 + latin_count * 9.5

    # Add padding
    padding = 12
    return width + padding


def generate_svg(filename: str, texts: List[str]) -> str:
    """
    Generate an SVG image of rounded rectangles with alternating colors.

    Args:
        filename: file name the SVG will be saved as
        texts: list of texts to display in the rectangles

    Returns:
        the SVG content as a string
    """
    # Fixed parameters
    rect_height = 40
    y_pos = 10

    # Compute each rectangle's width from its text
    rect_widths = [calculate_width(text) for text in texts]

    # Initialize positions
    positions = []
    current_x = 10

    for width in rect_widths:
        positions.append(current_x)
        current_x += width + 5  # 5px gap between rectangles

    # Total SVG width
    total_width = positions[-1] + rect_widths[-1] + 10

    # Create the SVG
    svg = f'<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {total_width} 60" width="{total_width}" height="60">'
    svg += f'<path fill="none" d="M0 0h{total_width}v60H0z"/>'

    # Add rectangles and text
    for i, (text, x_pos, width) in enumerate(zip(texts, positions, rect_widths)):
        # Escape special characters in the text
        escaped_text = html.escape(text)

        # Alternate colors
        fill_color = "#ffec99" if i % 2 == 0 else "#a5d8ff"

        # Add the rectangle
        svg += f'<rect x="{x_pos}" y="{y_pos}" width="{width}" height="{rect_height}" rx="10" ry="10" '
        svg += f'fill="{fill_color}" stroke="#000" stroke-width="1.5"/>'

        # Add the text
        text_x = x_pos + width / 2
        text_y = y_pos + rect_height / 2 + 7.5  # vertically centered (approximate)
        svg += f'<text x="{text_x}" y="{text_y}" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">{escaped_text}</text>'

    # Close the SVG
    svg += '</svg>'

    return svg


def parse_input(input_str: str) -> Tuple[str, List[str]]:
    """
    Parse an input string of the form: filename|text1|text2|...|textN

    Args:
        input_str: input string in the required format

    Returns:
        a (filename, [text1, text2, ..., textN]) tuple
    """
    parts = input_str.strip().split('|')
    if len(parts) < 2:
        raise ValueError("Input must contain at least a filename and one text element, separated by '|'")

    filename = parts[0]
    texts = parts[1:]

    return filename, texts


def main():
    """Main entry point: process input and generate the SVG."""
    print("SVG Generator Tool")
    print("==================")
    print("This tool generates SVG images of rounded rectangles with alternating colors.")
    print("Input format: filename|text1|text2|...|textN")
    print("Example: output|你好|我今天能怎么帮助你|1+1等于几|2")
    print()

    if len(sys.argv) > 1:
        # Read input from the command-line argument
        input_str = sys.argv[1]
    else:
        # Read input from stdin
        print("Please enter input in the format: filename|text1|text2|...|textN")
        input_str = input().strip()

    try:
        filename, texts = parse_input(input_str)
        svg_content = generate_svg(filename, texts)

        # Add the .svg extension if missing
        if not filename.lower().endswith('.svg'):
            filename += '.svg'

        # Save to file
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(svg_content)

        print(f"SVG image saved to {filename}")

    except Exception as e:
        print(f"Error: {e}")
        print("Please check the input format and try again.")


if __name__ == "__main__":
    main()
1
content/zh-cn/posts/llama-cpp/common-terms/content.svg
Normal file
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 455 60" width="455" height="60"><path fill="none" d="M0 0h455v60H0z"/><rect x="10" y="10" width="60" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="40" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">你好</text><rect x="75" y="10" width="200" height="40" rx="10" ry="10" fill="#a5d8ff" stroke="#000" stroke-width="1.5"/><text x="175" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">我今天能怎么帮助你</text><rect x="280" y="10" width="120" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="340" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">1+1等于几</text><rect x="405" y="10" width="40" height="40" rx="10" ry="10" fill="#a5d8ff" stroke="#000" stroke-width="1.5"/><text x="425" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">2</text></svg>
1
content/zh-cn/posts/llama-cpp/common-terms/hello-hello-1plus1-equal2.svg
Normal file
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 666.5 60" width="666.5" height="60"><path fill="none" d="M0 0h666.5v60H0z"/><rect x="10" y="10" width="69.0" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="44.5" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">Hello!</text><rect x="84.0" y="10" width="316.0" height="40" rx="10" ry="10" fill="#a5d8ff" stroke="#000" stroke-width="1.5"/><text x="242.0" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">Hello! How can I help you today?</text><rect x="405.0" y="10" width="118.5" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="464.25" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">1+1等于几</text><rect x="528.5" y="10" width="128.0" height="40" rx="10" ry="10" fill="#a5d8ff" stroke="#000" stroke-width="1.5"/><text x="592.5" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">1+1等于2。</text></svg>
1
content/zh-cn/posts/llama-cpp/common-terms/hello-hello-1plus1.svg
Normal file
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 533.5 60" width="533.5" height="60"><path fill="none" d="M0 0h533.5v60H0z"/><rect x="10" y="10" width="69.0" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="44.5" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">Hello!</text><rect x="84.0" y="10" width="316.0" height="40" rx="10" ry="10" fill="#a5d8ff" stroke="#000" stroke-width="1.5"/><text x="242.0" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">Hello! How can I help you today?</text><rect x="405.0" y="10" width="118.5" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="464.25" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">1+1等于几</text></svg>
1
content/zh-cn/posts/llama-cpp/common-terms/hello-hello.svg
Normal file
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 410.0 60" width="410.0" height="60"><path fill="none" d="M0 0h410.0v60H0z"/><rect x="10" y="10" width="69.0" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="44.5" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">Hello!</text><rect x="84.0" y="10" width="316.0" height="40" rx="10" ry="10" fill="#a5d8ff" stroke="#000" stroke-width="1.5"/><text x="242.0" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">Hello! How can I help you today?</text></svg>
1
content/zh-cn/posts/llama-cpp/common-terms/hello.svg
Normal file
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 89.0 60" width="89.0" height="60"><path fill="none" d="M0 0h89.0v60H0z"/><rect x="10" y="10" width="69.0" height="40" rx="10" ry="10" fill="#ffec99" stroke="#000" stroke-width="1.5"/><text x="44.5" y="37.5" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">Hello!</text></svg>
666
content/zh-cn/posts/llama-cpp/common-terms/index.md
Normal file
@@ -0,0 +1,666 @@
---
title: 大语言模型常见术语,概念与解释
subtitle:
date: 2025-04-15T17:22:42-04:00
lastmod: 2025-04-15T17:22:42-04:00
slug: common-terms
draft: false
author:
  name: James
  link: https://www.jamesflare.com
  email:
  avatar: /site-logo.avif
description: 这篇博客文章解释了与大型语言模型(LLM)相关的常见术语和概念,包括上下文窗口、温度以及多模态和推理模型等各种模型类型。它还涵盖了实际应用和API使用示例。
keywords: ["大语言模型","LLM","AI模型","AI术语","AI概念","LLM术语","context window","temperature","多模态模型","推理模型"]
license:
comment: true
weight: 0
tags:
  - 大语言模型
  - llama.cpp
  - Ollama
categories:
  - 大语言模型
collections:
  - LLM
hiddenFromHomePage: false
hiddenFromSearch: false
hiddenFromRss: false
hiddenFromRelated: false
summary: 这篇博客文章解释了与大型语言模型(LLM)相关的常见术语和概念,包括上下文窗口、温度以及多模态和推理模型等各种模型类型。它还涵盖了实际应用和API使用示例。
resources:
  - name: featured-image
    src: featured-image.jpg
  - name: featured-image-preview
    src: featured-image-preview.jpg
toc: true
math: false
lightgallery: false
password:
message:
repost:
  enable: false
  url:

# See details front matter: https://fixit.lruihao.cn/documentation/content-management/introduction/#front-matter
---

<!--more-->

## 模型是怎么生成回答的?

现在的主流模型用的都是自回归生成,它会根据上文生成下文。模型本身没有记忆,也没有真的在和你对话,而是这样的一个情况:

{{< image src="content.svg" caption="Context Window" >}}

每一次对话都在增加这个上下文长度,直到它超过一个上限,超过这个上限后,要么丢弃内容,要么压缩总结。这个上限就叫上下文窗口。

在一些情况下,上下文窗口不够大就没法有效处理一些内容。Google 的 Gemini 2.5 Pro 支持 100 万的上下文长度,OpenAI 新发布的 GPT-4.1 也支持最长 100 万的上下文长度,o3 有 20 万的上下文长度。

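如果你好奇一段对话会多快填满窗口,可以自己数一数 Token。下面是一个用`tiktoken`库的最小示例;不同模型的分词器并不相同,这里的数字只是近似:

```python
# Approximate how much of the context window a conversation occupies.
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

conversation = [
    "Hello!",
    "Hello! How can I help you today?",
    "1+1等于几",
    "2",
]

total = sum(len(enc.encode(message)) for message in conversation)
print(f"Tokens used so far: {total}")  # grows with every turn
```
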
### Temperature

温度是 LLM 中的一个参数,它影响生成文本的随机性和创造性。较低的温度,如 0.2,会使输出更加集中和可预测,适合需要准确性的任务,如技术写作。较高的温度,如 0.7,会使输出更加多样化和富有创造性,适合讲故事或头脑风暴。

可以看看 IBM 写的 [What is LLM Temperature?](https://www.ibm.com/think/topics/llm-temperature),是一篇不错的文章。

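从机制上讲,温度只是在 softmax 之前把模型的 logits 除以一个系数,让分布变得更尖锐或更平缓。下面是一个示意性的小例子(并不是某个具体模型的实现):

```python
import numpy as np

def sample_with_temperature(logits: np.ndarray, temperature: float) -> int:
    """Sample a token id from temperature-scaled logits."""
    scaled = logits / temperature           # T < 1 sharpens, T > 1 flattens
    probs = np.exp(scaled - scaled.max())   # numerically stable softmax
    probs /= probs.sum()
    return int(np.random.choice(len(probs), p=probs))

logits = np.array([2.0, 1.0, 0.5, -1.0])
print(sample_with_temperature(logits, 0.2))  # almost always picks token 0
print(sample_with_temperature(logits, 1.5))  # noticeably more varied
```
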
### Top K 以及 Top P

Top K 将下一个 Token 的选择限制在模型输出中概率最高的前 K 个选项。例如,如果 K 设置为 5,模型只会考虑最有可能的 5 个 Token,使生成的文本更加连贯和可预测。这有助于减少随机性,但将 K 设置得太低可能会限制创造力。

Top P(也称为核心采样)通过选择累积概率超过指定阈值(例如 0.9)的最小一组 Token 来工作。这意味着它包含所有共同占 90% 概率质量的标记,从而在避免非常低概率、可能无意义的标记的同时,允许产生更多样化的输出。它通常被认为比 Top K 更灵活。

有一篇文章写得很好 [How to generate text: using different decoding methods for language generation with Transformers](https://huggingface.co/blog/how-to-generate) 可以看看。

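为了让这两种过滤方式更直观,下面粗略演示采样器如何把它们应用到一个概率分布上;真实实现会直接在 logits 上操作,并处理更多边界情况:

```python
import numpy as np

def top_k_filter(probs: np.ndarray, k: int) -> np.ndarray:
    """Keep only the k most probable tokens, then renormalize."""
    cutoff = np.sort(probs)[-k]
    filtered = np.where(probs >= cutoff, probs, 0.0)
    return filtered / filtered.sum()

def top_p_filter(probs: np.ndarray, p: float) -> np.ndarray:
    """Keep the smallest set of tokens whose cumulative probability reaches p."""
    order = np.argsort(probs)[::-1]              # most probable first
    cumulative = np.cumsum(probs[order])
    keep = order[: int(np.searchsorted(cumulative, p)) + 1]
    filtered = np.zeros_like(probs)
    filtered[keep] = probs[keep]
    return filtered / filtered.sum()

probs = np.array([0.5, 0.2, 0.15, 0.1, 0.05])
print(top_k_filter(probs, 2))    # only the top 2 tokens survive
print(top_p_filter(probs, 0.9))  # the tokens covering 90% of the mass survive
```
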
### 小小实践

如果你还没完全理解上下文的概念,先看这个请求:

{{< image src="hello.svg" caption="Hello!" >}}

```bash
|
||||
curl https://api.openai.com/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $OPENAI_API_KEY" \
|
||||
-d '{
|
||||
"model": "gpt-4.1",
|
||||
"messages": [
|
||||
{
|
||||
"role": "developer",
|
||||
"content": "You are a helpful assistant."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello!"
|
||||
}
|
||||
],
|
||||
"temperature": 0.7
|
||||
}'
|
||||
```
|
||||
|
||||
它向模型`gpt-4.1`发起了一个消息`Hello!`。它的系统提示词是`You are a helpful assistant.`,`temperature`为`0.7`。

> [!Tip]
> 其中,`developer`是一个新的 role。它在 o1 以及之后的模型中替代原先的`system`。

你应该会得到这样的一个响应:

{{< image src="hello-hello.svg" caption="Hello! How can I help you today?" >}}

```json
|
||||
{
|
||||
"id": "chatcmpl-BPctVve9zkK7PuUsFvpWbsWzWdR0b",
|
||||
"object": "chat.completion",
|
||||
"created": 1745447409,
|
||||
"model": "gpt-4.1-2025-04-14",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "Hello! How can I help you today?",
|
||||
"refusal": null,
|
||||
"annotations": []
|
||||
},
|
||||
"logprobs": null,
|
||||
"finish_reason": "stop"
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 19,
|
||||
"completion_tokens": 10,
|
||||
"total_tokens": 29,
|
||||
"prompt_tokens_details": {
|
||||
"cached_tokens": 0,
|
||||
"audio_tokens": 0
|
||||
},
|
||||
"completion_tokens_details": {
|
||||
"reasoning_tokens": 0,
|
||||
"audio_tokens": 0,
|
||||
"accepted_prediction_tokens": 0,
|
||||
"rejected_prediction_tokens": 0
|
||||
}
|
||||
},
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_b38e740b47"
|
||||
}
|
||||
```
|
||||
|
||||
而多轮对话会像这样发起请求。

{{< image src="hello-hello-1plus1.svg" caption="1+1等于几" >}}

```bash
|
||||
curl https://api.openai.com/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $OPENAI_API_KEY" \
|
||||
-d '{
|
||||
"model": "gpt-4.1",
|
||||
"messages": [
|
||||
{
|
||||
"role": "developer",
|
||||
"content": "You are a helpful assistant."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello!"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Hello! How can I help you today?"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "1+1等于几"
|
||||
}
|
||||
],
|
||||
"temperature": 0.7
|
||||
}'
|
||||
```
|
||||
|
||||
我们也会收到一个响应:

{{< image src="hello-hello-1plus1-equal2.svg" caption="1+1等于2。" >}}

```json
|
||||
{
|
||||
"id": "chatcmpl-BPcv4alGyqzbbwN9ZBzXVXUcX7CWS",
|
||||
"object": "chat.completion",
|
||||
"created": 1745447506,
|
||||
"model": "gpt-4.1-2025-04-14",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "1+1等于2。",
|
||||
"refusal": null,
|
||||
"annotations": []
|
||||
},
|
||||
"logprobs": null,
|
||||
"finish_reason": "stop"
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 42,
|
||||
"completion_tokens": 8,
|
||||
"total_tokens": 50,
|
||||
"prompt_tokens_details": {
|
||||
"cached_tokens": 0,
|
||||
"audio_tokens": 0
|
||||
},
|
||||
"completion_tokens_details": {
|
||||
"reasoning_tokens": 0,
|
||||
"audio_tokens": 0,
|
||||
"accepted_prediction_tokens": 0,
|
||||
"rejected_prediction_tokens": 0
|
||||
}
|
||||
},
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a1102cf978"
|
||||
}
|
||||
```
|
||||
|
||||
这基本上就是 API 基本的作用方式了,相信你也对对话形式有了更深的了解。

### 常见疑问

> **我能对话多少轮?**
>
> 不少用户侧产品都会限制用户的对话数量,不同等级有不同的配额。如果你理解了上面的内容,你会发现,API 是每次请求收费的,和上下文长度有直接关系,和对话轮数没有直接关系。也就是说,如果想最大化发挥配额的价值,那你应该仔细构建提示词,争取在几轮对话内完成任务。另外即便模型一样,不同级别套餐的上下文窗口(Context Window)也未必是一样的。

> **这个模型支持文件上传吗?**
>
> 不少用户会上传 PPT、PDF 等文件给模型。但是我想提醒的是,模型本身可能并没法原生读取这些文件。大多数模型只能处理文本输入输出,能处理多种输入输出的,我们叫做多模态模型。现在主流模型都支持图像输入,也是多模态模型。一些前沿模型支持语音、视频输入输出。你可能注意到了,它们都不支持所谓的 PPT、PDF 文件。
> 事实上,文件上传是各家的独自实现。本质是把它解析成文本内容,也许有图像内容。如果你不清楚它的原理,你可以看看 [Mathpix](https://mathpix.com/) 的服务,它给多家公司提供了 PDF 的解析服务。它可以把 PDF 解析成 Markdown 之类的文本格式,然后附带在给 LLM 的请求里实现文档理解和 QA。所以,如果你讲究一点的话,可以把 PDF 里的文本内容手动粘贴在请求里,效果不会更差,甚至会更好。

> **有时文本长度明显超过了窗口限制,为什么还能对话?**
>
> 这也是一种优化技巧。早期 GPT-3.5 的上下文窗口只有 4096 Token,很快就满了。这时候会让模型总结上文来压缩上下文长度。除了这招,还有一些高端一点的操作,比如 RAG 这样的检索增强技术。不过性能是绝对不如原生窗口的,而且随着对话轮数增多,模型性能也会下降。这在早期的 LLM 上非常明显,经过这些年行业的改进,多轮对话性能已经有了长足的提升。

## 细分的模型类型

在模型不断的发展过程中,有很多新技术、新特性被引入。它们有什么区别,又代表了什么呢?

### 多模态模型 Multimodal Model

如果一个模型只能处理文本输入,同时输出文本,那它就是单模态模型,反之就是多模态模型。比如当初的 GPT-4,它支持文字和图像输入,文本输出,首次给 LLM 引入了图像理解能力,也就是多模态模型。

现在我们希望模型有更多模态,比如语音输入、视频输入、图像输入,以及语音输出、图像生成。这些也在最新发布的模型中越来越常见。

### 推理模型 Reasoning Model

很早之前,人们就发现,如果在请求里加上诸如“请一步一步思考”,就可以显著提升 LLM 回答的正确率。

OpenAI 发布的 o1 是第一个真正大规模商用的 Reasoning Model。它和其它模型的区别就是它会在生成回答之前进行 CoT(Chain-of-Thought)推理。对数学等 STEM 理科方面的性能提升特别明显,对复杂任务的性能提升也特别显著。OpenAI 写了篇文章来讲述它的性能提升:[Learning to reason with LLMs](https://openai.com/index/learning-to-reason-with-llms/)。

可以看到性能相比一般模型的提升有多么巨大,它开创了一个新时代。

{{< echarts >}} {
|
||||
"tooltip": {
|
||||
"trigger": "axis",
|
||||
"axisPointer": {
|
||||
"type": "shadow"
|
||||
}
|
||||
},
|
||||
"legend": {
|
||||
"top": 30,
|
||||
"data": ["gpt4o", "o1 preview", "o1", "expert human"]
|
||||
},
|
||||
"grid": {
|
||||
"left": "8%",
|
||||
"right": "8%",
|
||||
"bottom": "10%",
|
||||
"containLabel": true
|
||||
},
|
||||
"xAxis": {
|
||||
"type": "category",
|
||||
"data": [
|
||||
"AIME 2024\n(Competition Math)",
|
||||
"Codeforces\n(Competition Code)",
|
||||
"GPQA Diamond\n(PhD-Level Science)"
|
||||
],
|
||||
"axisLabel": {
|
||||
"interval": 0
|
||||
}
|
||||
},
|
||||
"yAxis": {
|
||||
"type": "value",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"name": "Accuracy / Percentile (%)",
|
||||
"nameGap": 32,
|
||||
"nameLocation": "center"
|
||||
},
|
||||
"series": [
|
||||
{
|
||||
"name": "gpt4o",
|
||||
"type": "bar",
|
||||
"data": [13.4, 11.0, 56.1],
|
||||
"barGap": "0",
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "o1 preview",
|
||||
"type": "bar",
|
||||
"data": [56.7, 62.0, 78.3],
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "o1",
|
||||
"type": "bar",
|
||||
"data": [83.3, 89.0, 78.0],
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "expert human",
|
||||
"type": "bar",
|
||||
"data": [null, null, 69.7],
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
{{< /echarts >}}
|
||||
|
||||
而 DeepSeek R1 则是第一个能媲美 o1 的开源推理模型。它也开创了一个新时代,类似的训练方法用在了许多模型上,也启发了新研究。开源社区首次有了 o1 级别的模型。

### 非推理模型 Non-reasoning Model

有了推理模型后,为了方便区分,就把回答之前不进行推理的模型归类为非推理模型。它们在 STEM 领域的性能显著不如推理模型,但是有很好的成本效益,因为推理的 CoT 有时候会很长,从而带来巨大的成本。并且推理模型相比非推理模型在文学方面并没有显著优势,而且等待模型进行推理的时间也不是在哪里都可以接受的。

### 混合模型 Hybrid Model

那有没有办法综合推理模型的性能以及非推理模型的成本效益优势呢?Anthropic 给出的答案是 Claude 3.7 Sonnet,首个混合模型。它能在推理和非推理之间进行切换,而且还能给模型指定 CoT Token 的预算,实现更好的成本控制。

{{< echarts >}} {
|
||||
"tooltip": {
|
||||
"trigger": "axis",
|
||||
"axisPointer": {
|
||||
"type": "shadow"
|
||||
}
|
||||
},
|
||||
"legend": {
|
||||
"top": 30,
|
||||
"data": [
|
||||
"Claude 3.7 Sonnet (ext)",
|
||||
"Claude 3.7 Sonnet",
|
||||
"OpenAI o1",
|
||||
"DeepSeek R1"
|
||||
]
|
||||
},
|
||||
"grid": {
|
||||
"left": "8%",
|
||||
"right": "8%",
|
||||
"bottom": "10%",
|
||||
"containLabel": true
|
||||
},
|
||||
"xAxis": {
|
||||
"type": "category",
|
||||
"data": [
|
||||
"GPQA\nDiamond",
|
||||
"SWE-bench\nVerified",
|
||||
"MMALU",
|
||||
"IFEval",
|
||||
"MATH 500",
|
||||
"AIME 2024"
|
||||
],
|
||||
"axisLabel": {
|
||||
"interval": 0
|
||||
}
|
||||
},
|
||||
"yAxis": {
|
||||
"type": "value",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"name": "Accuracy / Percentile (%)",
|
||||
"nameGap": 32,
|
||||
"nameLocation": "center"
|
||||
},
|
||||
"series": [
|
||||
{
|
||||
"name": "Claude 3.7 Sonnet (ext)",
|
||||
"type": "bar",
|
||||
"data": [84.8, null, 86.1, 93.2, 96.2, 80.0],
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top",
|
||||
"fontSize": 10
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Claude 3.7 Sonnet",
|
||||
"type": "bar",
|
||||
"data": [68.0, 70.3, 83.2, 90.8, 82.2, 23.3],
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top",
|
||||
"fontSize": 10
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "OpenAI o1",
|
||||
"type": "bar",
|
||||
"data": [78.0, 48.9, 87.7, null, 96.4, 83.3],
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top",
|
||||
"fontSize": 10
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "DeepSeek R1",
|
||||
"type": "bar",
|
||||
"data": [71.5, 49.2, 79.5, 83.3, 97.3, 79.8],
|
||||
"label": {
|
||||
"show": true,
|
||||
"position": "top",
|
||||
"fontSize": 10
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
{{< /echarts >}}
|
||||
|
||||
后来也有一些开源模型采用了这样的理念,比如 Cogito v1 Preview。它通过在 System Prompt 开头加一句“Enable deep thinking subroutine.”来开启推理模式。

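举个例子,假设你把这样的模型部署在一个 OpenAI 兼容的接口后面(下面的本地 URL 和模型名只是示意),切换推理模式就只是换一句系统提示词:

```python
# Hypothetical sketch: toggling Cogito-style reasoning mode via the
# system prompt on an OpenAI-compatible local server (URL/model name assumed).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:11434/v1", api_key="none")

def ask(question: str, deep_thinking: bool) -> str:
    system = "Enable deep thinking subroutine." if deep_thinking else ""
    response = client.chat.completions.create(
        model="cogito:8b",  # illustrative model name
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": question},
        ],
    )
    return response.choices[0].message.content

print(ask("1+1等于几", deep_thinking=True))
```
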
## 模型特性

LLM 也在引入越来越多的新特性,有些很强大,也很时髦。

### Function Calling

我们希望扩展模型的能力,例如调用外部工具的能力。发出这样的一个请求:

```bash
|
||||
curl https://api.openai.com/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $OPENAI_API_KEY" \
|
||||
-d '{
|
||||
"model": "gpt-4.1",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Can you send an email to ilan@example.com and katia@example.com saying hi?"
|
||||
}
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "send_email",
|
||||
"description": "Send an email to a given recipient with a subject and message.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"to": {
|
||||
"type": "string",
|
||||
"description": "The recipient email address."
|
||||
},
|
||||
"subject": {
|
||||
"type": "string",
|
||||
"description": "Email subject line."
|
||||
},
|
||||
"body": {
|
||||
"type": "string",
|
||||
"description": "Body of the email message."
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"to",
|
||||
"subject",
|
||||
"body"
|
||||
],
|
||||
"additionalProperties": false
|
||||
},
|
||||
"strict": true
|
||||
}
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
你会收到这样的模型返回,内容是调用先前我们定义的工具。

```json
|
||||
{
|
||||
"id": "chatcmpl-BPy4wVm9NJBqduSwkHF3FKtyvBRF0",
|
||||
"object": "chat.completion",
|
||||
"created": 1745528842,
|
||||
"model": "gpt-4.1-2025-04-14",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": null,
|
||||
"tool_calls": [
|
||||
{
|
||||
"id": "call_RLgC1PNJou9nepd5zB6rh3Yj",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "send_email",
|
||||
"arguments": "{\"to\": \"ilan@example.com\", \"subject\": \"Hello!\", \"body\": \"Hi Ilan,\"}"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "call_zum2F3ZHdlv0ECZQ1Tur4zyZ",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "send_email",
|
||||
"arguments": "{\"to\": \"katia@example.com\", \"subject\": \"Hello!\", \"body\": \"Hi Katia,\"}"
|
||||
}
|
||||
}
|
||||
],
|
||||
"refusal": null,
|
||||
"annotations": []
|
||||
},
|
||||
"logprobs": null,
|
||||
"finish_reason": "tool_calls"
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 91,
|
||||
"completion_tokens": 74,
|
||||
"total_tokens": 165,
|
||||
"prompt_tokens_details": {
|
||||
"cached_tokens": 0,
|
||||
"audio_tokens": 0
|
||||
},
|
||||
"completion_tokens_details": {
|
||||
"reasoning_tokens": 0,
|
||||
"audio_tokens": 0,
|
||||
"accepted_prediction_tokens": 0,
|
||||
"rejected_prediction_tokens": 0
|
||||
}
|
||||
},
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a1102cf978"
|
||||
}
|
||||
```
|
||||
|
||||
不过整个流程到这里还没结束,因为模型不会执行调用的工具,这需要你执行工具调用并且把内容返回给模型。完整流程类似于

```mermaid
|
||||
sequenceDiagram
|
||||
participant Dev as Developer
|
||||
participant LLM as Model
|
||||
|
||||
Note over Dev,LLM: 1: Tool Definitions + Messages
|
||||
Dev->>LLM: get_weather(location)<br>What's the weather in Paris?
|
||||
|
||||
Note over Dev,LLM: 2: Tool Calls
|
||||
LLM-->>Dev: get_weather("paris")
|
||||
|
||||
Note over Dev: 3: Execute Function Code
|
||||
Dev->>Dev: get_weather("paris")<br>{"temperature": 14}
|
||||
|
||||
Note over Dev,LLM: 4: Results
|
||||
Dev->>LLM: All Prior Messages<br>{"temperature": 14}
|
||||
|
||||
Note over Dev,LLM: 5: Final Response
|
||||
LLM-->>Dev: It's currently 14°C in Paris.
|
||||
```
|
||||
|
||||
有兴趣可以看看 OpenAI 给出的示例 [Function calling](https://platform.openai.com/docs/guides/function-calling?api-mode=chat&lang=curl&example=get-weather&strict-mode=enabled)。

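把上图中的第 2 步到第 4 步粗略写成代码大概是这样(使用 OpenAI Python 客户端;`send_email`只是你自己实现的占位函数):

```python
# Minimal sketch of the tool-call round trip with the OpenAI Python client.
import json
from openai import OpenAI

client = OpenAI()

def send_email(to: str, subject: str, body: str) -> str:
    """Stand-in for your real email-sending implementation."""
    return f"email to {to} queued"

tools = [{
    "type": "function",
    "function": {
        "name": "send_email",
        "description": "Send an email to a given recipient with a subject and message.",
        "parameters": {
            "type": "object",
            "properties": {
                "to": {"type": "string"},
                "subject": {"type": "string"},
                "body": {"type": "string"},
            },
            "required": ["to", "subject", "body"],
        },
    },
}]

messages = [{"role": "user", "content": "Can you send an email to ilan@example.com saying hi?"}]

# Step 2: the model answers with tool calls instead of text.
response = client.chat.completions.create(model="gpt-4.1", messages=messages, tools=tools)
message = response.choices[0].message
messages.append(message)  # keep the assistant's tool calls in the context

# Step 3: execute each tool call and append its result.
for call in message.tool_calls or []:
    args = json.loads(call.function.arguments)
    result = send_email(**args)
    messages.append({"role": "tool", "tool_call_id": call.id, "content": result})

# Steps 4-5: send everything back; the model now writes the final response.
final = client.chat.completions.create(model="gpt-4.1", messages=messages, tools=tools)
print(final.choices[0].message.content)
```
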
### Structured Outputs

这个就是字面意思,可以让模型返回预先定义的 JSON 格式的响应。可以看看 OpenAI 给出的示例 [Structured Outputs](https://platform.openai.com/docs/guides/structured-outputs?api-mode=chat&lang=python)。

比如这段用 Pydantic 实现的例子,来自 [SGLang 的文档](https://docs.sglang.ai/backend/structured_outputs.html#JSON)。

```python
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
# Define the schema using Pydantic
|
||||
class CapitalInfo(BaseModel):
|
||||
name: str = Field(..., pattern=r"^\w+$", description="Name of the capital city")
|
||||
population: int = Field(..., description="Population of the capital city")
|
||||
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Please generate the information of the capital of France in the JSON format.",
|
||||
},
|
||||
],
|
||||
temperature=0,
|
||||
max_tokens=128,
|
||||
response_format={
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "foo",
|
||||
# convert the pydantic model to json schema
|
||||
"schema": CapitalInfo.model_json_schema(),
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
response_content = response.choices[0].message.content
|
||||
# validate the JSON response by the pydantic model
|
||||
capital_info = CapitalInfo.model_validate_json(response_content)
|
||||
print_highlight(f"Validated response: {capital_info.model_dump_json()}")
|
||||
```
|
||||
|
||||
返回大概是这样的:

```console
Validated response: {"name":"Paris","population":2147000}
```

也可以直接使用 JSON 实现:

```python
|
||||
import json
|
||||
|
||||
json_schema = json.dumps(
|
||||
{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string", "pattern": "^[\\w]+$"},
|
||||
"population": {"type": "integer"},
|
||||
},
|
||||
"required": ["name", "population"],
|
||||
}
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Give me the information of the capital of France in the JSON format.",
|
||||
},
|
||||
],
|
||||
temperature=0,
|
||||
max_tokens=128,
|
||||
response_format={
|
||||
"type": "json_schema",
|
||||
"json_schema": {"name": "foo", "schema": json.loads(json_schema)},
|
||||
},
|
||||
)
|
||||
|
||||
print_highlight(response.choices[0].message.content)
|
||||
```
|
||||
|
||||
返回大概是这样的:

```console
{"name": "Paris", "population": 2147000}
```

当然,Structured Outputs 远不止 JSON,不过这里只是抛砖引玉罢了。

164
content/zh-cn/posts/llama-cpp/common-terms/svg_generator.py
Normal file
@@ -0,0 +1,164 @@
|
||||
import sys
|
||||
import html
|
||||
import unicodedata
|
||||
from typing import List, Tuple
|
||||
|
||||
def is_cjk(char):
|
||||
"""
|
||||
检查字符是否是中日韩文字或标点符号
|
||||
|
||||
东亚字符宽度标记:
|
||||
- W: 宽字符 (如汉字)
|
||||
- F: 全角字符
|
||||
- A: 歧义字符 (部分中文标点)
|
||||
"""
|
||||
east_asian_width = unicodedata.east_asian_width(char)
|
||||
return east_asian_width in ('W', 'F', 'A')
|
||||
|
||||
def is_cjk_punct(char):
|
||||
"""检查字符是否是中文标点符号"""
|
||||
# 常见中文标点符号的Unicode范围
|
||||
cjk_punct_ranges = [
|
||||
(0x3000, 0x303F), # CJK标点符号
|
||||
(0xFF00, 0xFFEF), # 全角ASCII、全角标点
|
||||
]
|
||||
|
||||
code = ord(char)
|
||||
return any(start <= code <= end for start, end in cjk_punct_ranges)
|
||||
|
||||
def calculate_width(text: str) -> float:
|
||||
"""
|
||||
根据文本内容计算矩形宽度
|
||||
|
||||
Args:
|
||||
text: 要计算宽度的文本
|
||||
|
||||
Returns:
|
||||
像素宽度
|
||||
"""
|
||||
if len(text) <= 1:
|
||||
return 40
|
||||
|
||||
# 计算中文字符和中文标点符号数量
|
||||
cjk_count = sum(1 for char in text if is_cjk(char) or is_cjk_punct(char))
|
||||
latin_count = len(text) - cjk_count
|
||||
|
||||
# 根据示例近似计算宽度
|
||||
width = cjk_count * 26 + latin_count * 9.5
|
||||
|
||||
# 添加内边距
|
||||
padding = 12
|
||||
return width + padding
|
||||
|
||||
def generate_svg(filename: str, texts: List[str]) -> str:
|
||||
"""
|
||||
生成带有交替颜色圆角矩形的SVG图像
|
||||
|
||||
Args:
|
||||
filename: 保存SVG的文件名
|
||||
texts: 要在矩形中显示的文本列表
|
||||
|
||||
Returns:
|
||||
SVG内容字符串
|
||||
"""
|
||||
# 固定参数
|
||||
rect_height = 40
|
||||
y_pos = 10
|
||||
|
||||
# 根据文本长度计算每个矩形的宽度
|
||||
rect_widths = [calculate_width(text) for text in texts]
|
||||
|
||||
# 初始化位置
|
||||
positions = []
|
||||
current_x = 10
|
||||
|
||||
for width in rect_widths:
|
||||
positions.append(current_x)
|
||||
current_x += width + 5 # 矩形之间5px间距
|
||||
|
||||
# 计算SVG总宽度
|
||||
total_width = positions[-1] + rect_widths[-1] + 10
|
||||
|
||||
# 创建SVG
|
||||
svg = f'<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {total_width} 60" width="{total_width}" height="60">'
|
||||
svg += f'<path fill="none" d="M0 0h{total_width}v60H0z"/>'
|
||||
|
||||
# 添加矩形和文本
|
||||
for i, (text, x_pos, width) in enumerate(zip(texts, positions, rect_widths)):
|
||||
# 转义文本中的特殊字符
|
||||
escaped_text = html.escape(text)
|
||||
|
||||
# 交替颜色
|
||||
fill_color = "#ffec99" if i % 2 == 0 else "#a5d8ff"
|
||||
|
||||
# 添加矩形
|
||||
svg += f'<rect x="{x_pos}" y="{y_pos}" width="{width}" height="{rect_height}" rx="10" ry="10" '
|
||||
svg += f'fill="{fill_color}" stroke="#000" stroke-width="1.5"/>'
|
||||
|
||||
# 添加文本
|
||||
text_x = x_pos + width / 2
|
||||
text_y = y_pos + rect_height / 2 + 7.5 # 垂直居中(近似值)
|
||||
svg += f'<text x="{text_x}" y="{text_y}" font-family="Arial, sans-serif" font-size="20" text-anchor="middle">{escaped_text}</text>'
|
||||
|
||||
# 关闭SVG
|
||||
svg += '</svg>'
|
||||
|
||||
return svg
|
||||
|
||||
def parse_input(input_str: str) -> Tuple[str, List[str]]:
|
||||
"""
|
||||
解析格式为:filename|text1|text2|...|textN 的输入字符串
|
||||
|
||||
Args:
|
||||
input_str: 按要求格式的输入字符串
|
||||
|
||||
Returns:
|
||||
(filename, [text1, text2, ..., textN]) 元组
|
||||
"""
|
||||
parts = input_str.strip().split('|')
|
||||
if len(parts) < 2:
|
||||
raise ValueError("输入必须至少包含一个文件名和一个文本元素,用'|'分隔")
|
||||
|
||||
filename = parts[0]
|
||||
texts = parts[1:]
|
||||
|
||||
return filename, texts
|
||||
|
||||
def main():
|
||||
"""处理输入并生成SVG的主函数"""
|
||||
print("SVG 生成器工具")
|
||||
print("==============")
|
||||
print("此工具生成带有交替颜色圆角矩形的SVG图像。")
|
||||
print("输入格式: 文件名|文本1|文本2|...|文本N")
|
||||
print("示例: output|你好|我今天能怎么帮助你|1+1等于几|2")
|
||||
print()
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
# 从命令行参数获取输入
|
||||
input_str = sys.argv[1]
|
||||
else:
|
||||
# 从标准输入获取输入
|
||||
print("请按格式输入: 文件名|文本1|文本2|...|文本N")
|
||||
input_str = input().strip()
|
||||
|
||||
try:
|
||||
filename, texts = parse_input(input_str)
|
||||
svg_content = generate_svg(filename, texts)
|
||||
|
||||
# 如果没有.svg扩展名,添加它
|
||||
if not filename.lower().endswith('.svg'):
|
||||
filename += '.svg'
|
||||
|
||||
# 保存到文件
|
||||
with open(filename, 'w', encoding='utf-8') as f:
|
||||
f.write(svg_content)
|
||||
|
||||
print(f"SVG图像已保存到 {filename}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"错误: {e}")
|
||||
print("请检查输入格式并重试。")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()