# Completion
Run LLM inference via llama.cpp, i.e., use a language model to generate text output from input prompts.
## Usage
```js
import {
  completion,
  LLAMA_3_2_1B_INST_Q4_0,
  loadModel,
  downloadAsset,
  unloadModel,
  VERBOSITY,
} from "@qvac/sdk";

try {
  // Download the model to the local cache first
  await downloadAsset({
    assetSrc: LLAMA_3_2_1B_INST_Q4_0,
    onProgress: (progress) => {
      console.log(progress);
    },
  });

  // Then load it into memory from the cache
  const modelId = await loadModel({
    modelSrc: LLAMA_3_2_1B_INST_Q4_0,
    modelType: "llm",
    modelConfig: {
      device: "gpu",
      ctx_size: 2048,
      verbosity: VERBOSITY.ERROR,
    },
  });

  const history = [
    {
      role: "user",
      content: "Explain quantum computing in one sentence, use lots of emojis",
    },
  ];

  // Stream tokens to stdout as they are generated
  const result = completion({ modelId, history, stream: true });
  for await (const token of result.tokenStream) {
    process.stdout.write(token);
  }

  const stats = await result.stats;
  console.log("\n📊 Performance Stats:", stats);

  // Set `clearStorage: true` to also delete the cached model files
  await unloadModel({ modelId, clearStorage: false });
} catch (error) {
  console.error("❌ Error:", error);
  process.exit(1);
}
```
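
Because `completion` receives the full `history` array, multi-turn chat is a matter of appending each reply before the next call. Below is a minimal sketch reusing only the calls shown above, to be run before `unloadModel`; the shape of the `assistant` message is an assumption that mirrors the `user` message shape.

```js
// Collect the assistant's streamed reply into a string
let reply = "";
const first = completion({ modelId, history, stream: true });
for await (const token of first.tokenStream) {
  reply += token;
}

// Append the exchange to the conversation and ask a follow-up.
// Assumption: assistant turns use the same { role, content } shape as user turns.
history.push({ role: "assistant", content: reply });
history.push({ role: "user", content: "Now explain it without emojis." });

// Second turn: the model sees the whole conversation so far
const followUp = completion({ modelId, history, stream: true });
for await (const token of followUp.tokenStream) {
  process.stdout.write(token);
}
```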