# Stage 1: Fast vector searchcandidates = vector_db.search( query_embedding, limit=100 # Over-fetch)# Stage 2: Precise rerankingreranked = rerank( model="bge-reranker-v2", query=query, documents=[c.text for c in candidates], top_n=10)# Return top resultsreturn reranked
RAG Enhancement
Improve context quality for LLM responses:
Copy
def enhanced_rag(question):
    """Answer *question* via retrieve → rerank → generate.

    Retrieves a broad candidate pool, reranks it for relevance, and
    feeds the best chunks to the generator as context.
    """
    # Broad first-pass retrieval.
    chunks = retrieve_chunks(question, limit=20)

    # Second-pass reranking: keep only the 5 most relevant chunks.
    reranked = rerank(
        model="bge-reranker-v2",
        query=question,
        documents=[c.text for c in chunks],
        top_n=5,
    )

    # Concatenate the winning chunks into the LLM context window.
    context = "\n".join(r.document.text for r in reranked)

    return generate_with_context(question, context)
Hybrid Search
Merge and rerank results from multiple sources:
Copy
# Run both retrieval paths in parallel pools of 50 candidates each.
keyword_results = keyword_search(query, limit=50)
vector_results = vector_search(query, limit=50)

# Merge the two pools and drop duplicates before scoring.
all_docs = deduplicate(keyword_results + vector_results)

# Single unified reranking pass over the merged pool.
final = rerank(
    model="bge-reranker-v2",
    query=query,
    documents=all_docs,
    top_n=10,
)
Cross-Encoder Scoring
Get precise relevance scores:
Copy
def score_relevance(query, documents):
    """Rerank *documents* against *query* and keep high-relevance hits.

    Returns only the results whose cross-encoder relevance score
    exceeds 0.7.
    """
    results = rerank(
        model="bge-reranker-v2",
        query=query,
        documents=documents,
    )
    # Filter on the score threshold; everything below 0.7 is dropped.
    return [r for r in results if r.relevance_score > 0.7]