# Chat completion via the OpenAI-compatible HTTP endpoint.
# No Authorization header is sent: the local server requires no API key.
curl http://localhost:8090/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama-3.1-8b-instruct",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "What is the capital of France?"}
    ],
    "temperature": 0.7
  }'
Using OpenAI Python SDK:
"""Call a local OpenAI-compatible server through the official OpenAI SDK."""
from openai import OpenAI

# Point the SDK at the local server; the api_key value is a placeholder.
client = OpenAI(
    base_url="http://localhost:8090/v1",
    api_key="not-needed",  # No key required for local models
)

response = client.chat.completions.create(
    model="llama-3.1-8b-instruct",
    messages=[
        {"role": "system", "content": "You are a data analyst."},
        {"role": "user", "content": "Summarize this data: ..."},
    ],
)

# Print the assistant's reply (first choice only).
print(response.choices[0].message.content)
# Generate an embedding via the OpenAI-compatible /v1/embeddings endpoint.
curl http://localhost:8090/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
    "model": "text-embedding-3-small",
    "input": "The quick brown fox jumps over the lazy dog"
  }'
"""RAG flow: retrieve context with vector_search, then generate with an LLM."""
from openai import OpenAI
import duckdb

client = OpenAI(base_url="http://localhost:8090/v1", api_key="not-needed")

# Get relevant context.
# NOTE(review): duckdb.connect() normally takes a database path; passing an
# HTTP URL presumably relies on a server/extension-specific remote-connect
# feature of this product — confirm against its docs.
con = duckdb.connect("http://localhost:8090")
context_docs = con.execute("""
    SELECT content, _score
    FROM vector_search(kb_articles, 'How do I reset my password?', 5)
    ORDER BY _score DESC
""").fetchall()

# Build context string: join the `content` column of the top-5 hits.
context = "\n\n".join([doc[0] for doc in context_docs])

# Generate with context injected into the system prompt.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": f"Answer based on this context:\n{context}"},
        {"role": "user", "content": "How do I reset my password?"},
    ],
)
print(response.choices[0].message.content)
3. SQL-integrated RAG:
-- Get context in a single query: fetch the top-5 vector-search hits and
-- concatenate their content into one blank-line-separated context string.
WITH context AS (
    SELECT content
    FROM vector_search(
        kb_articles,
        'How do I reset my password?',
        5
    )
)
SELECT string_agg(content, '\n\n') as context_text
FROM context;
"""Text-to-SQL: describe the schema in the system prompt, ask in English."""
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8090/v1", api_key="not-needed")

# Get table schema (plain-text description passed to the model).
# NOTE(review): line breaks inside this string were reconstructed from a
# garbled source — exact layout is cosmetic, the model only needs the text.
schema = """table: orders
columns: order_id (int), customer_id (int), order_date (date), total (decimal)
table: customers
columns: customer_id (int), name (text), email (text)"""

# Generate SQL from natural language.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": f"Generate SQL for this schema:\n{schema}"},
        {"role": "user", "content": "Show me total sales by customer in the last 30 days"},
    ],
)

generated_sql = response.choices[0].message.content
print(generated_sql)
# Output: SELECT c.name, SUM(o.total) as total_sales FROM orders o JOIN customers c ...
# Load up to 10 turns of this session's chat history and convert the rows
# into OpenAI-style message dicts ({"role": ..., "content": ...}).
# NOTE(review): ORDER BY timestamp (ascending) + LIMIT 10 selects the OLDEST
# 10 rows; if the intent is the most recent 10, this wants ORDER BY
# timestamp DESC (then re-reversed for chronological order) — confirm.
history = con.execute("""
    SELECT role, content
    FROM chat_history
    WHERE session_id = ?
    ORDER BY timestamp
    LIMIT 10
""", [session_id]).fetchall()

messages = [{"role": row[0], "content": row[1]} for row in history]
-- Aggregate token usage by model
SELECT
    model,
    SUM(input_tokens) as total_input,
    SUM(output_tokens) as total_output,
    SUM(cost_usd) as total_cost
FROM runtime.llm_queries
GROUP BY model;
-- Hybrid search: semantic + keyword + filters.
-- Scores from both searches are summed (missing scores count as 0); rows
-- matching neither search are excluded, and results are limited to the
-- last year of publications.
WITH semantic AS (
    SELECT doc_id, _score FROM vector_search(docs, 'machine learning', 20)
),
keyword AS (
    SELECT doc_id, _score FROM text_search(docs, 'ML AI neural', 20)
)
SELECT
    d.*,
    COALESCE(s._score, 0) + COALESCE(k._score, 0) as relevance
FROM docs d
LEFT JOIN semantic s ON d.doc_id = s.doc_id
LEFT JOIN keyword k ON d.doc_id = k.doc_id
WHERE (s.doc_id IS NOT NULL OR k.doc_id IS NOT NULL)
  AND d.published_date >= CURRENT_DATE - INTERVAL '1 year'
ORDER BY relevance DESC;