Improve query type detection and add specialized extraction prompts for comparative queries

This commit is contained in:
Steve White 2025-03-12 11:57:40 -05:00
parent 21f75c0d25
commit c8c5240657
2 changed files with 46 additions and 21 deletions

View File

@@ -121,7 +121,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
return prioritized_chunks return prioritized_chunks
async def extract_information_from_chunk(self, chunk: Dict[str, Any], query: str, detail_level: str = "comprehensive") -> str: async def extract_information_from_chunk(self, chunk: Dict[str, Any], query: str, detail_level: str = "comprehensive", query_type: str = "exploratory") -> str:
""" """
Extract key information from a document chunk. Extract key information from a document chunk.
@@ -129,12 +129,13 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
chunk: Document chunk chunk: Document chunk
query: Original search query query: Original search query
detail_level: Level of detail for extraction detail_level: Level of detail for extraction
query_type: Type of query (factual, exploratory, comparative)
Returns: Returns:
Extracted information as a string Extracted information as a string
""" """
# Get the appropriate extraction prompt based on detail level # Get the appropriate extraction prompt based on detail level and query type
extraction_prompt = self._get_extraction_prompt(detail_level) extraction_prompt = self._get_extraction_prompt(detail_level, query_type)
# Create a prompt for extracting key information from the chunk # Create a prompt for extracting key information from the chunk
messages = [ messages = [
@@ -266,7 +267,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
logger.info(f"Initializing report with {len(initial_chunks)} chunks") logger.info(f"Initializing report with {len(initial_chunks)} chunks")
# Process initial chunks using the standard map-reduce approach # Process initial chunks using the standard map-reduce approach
processed_chunks = await self.map_document_chunks(initial_chunks, query, detail_level) processed_chunks = await self.map_document_chunks(initial_chunks, query, detail_level, query_type)
# Generate initial report # Generate initial report
initial_report = await self.reduce_processed_chunks(processed_chunks, query, query_type, detail_level) initial_report = await self.reduce_processed_chunks(processed_chunks, query, query_type, detail_level)
@@ -387,7 +388,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
# Extract information from chunks # Extract information from chunks
new_information = [] new_information = []
for chunk in next_batch: for chunk in next_batch:
extracted_info = await self.extract_information_from_chunk(chunk, query, detail_level) extracted_info = await self.extract_information_from_chunk(chunk, query, detail_level, query_type)
new_information.append((chunk, extracted_info)) new_information.append((chunk, extracted_info))
# Mark chunk as processed # Mark chunk as processed

View File

@@ -247,7 +247,7 @@ class ReportSynthesizer:
return clean_content.strip() return clean_content.strip()
async def map_document_chunks(self, chunks: List[Dict[str, Any]], query: str, detail_level: str = "standard") -> List[Dict[str, Any]]: async def map_document_chunks(self, chunks: List[Dict[str, Any]], query: str, detail_level: str = "standard", query_type: str = "exploratory") -> List[Dict[str, Any]]:
""" """
Map phase: Process individual document chunks to extract key information. Map phase: Process individual document chunks to extract key information.
@@ -255,14 +255,15 @@ class ReportSynthesizer:
chunks: List of document chunks chunks: List of document chunks
query: Original search query query: Original search query
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive) detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
query_type: Type of query (factual, exploratory, comparative)
Returns: Returns:
List of processed chunks with extracted information List of processed chunks with extracted information
""" """
processed_chunks = [] processed_chunks = []
# Get the appropriate extraction prompt based on detail level # Get the appropriate extraction prompt based on detail level and query type
extraction_prompt = self._get_extraction_prompt(detail_level) extraction_prompt = self._get_extraction_prompt(detail_level, query_type)
total_chunks = len(chunks) total_chunks = len(chunks)
logger.info(f"Starting to process {total_chunks} document chunks") logger.info(f"Starting to process {total_chunks} document chunks")
@@ -335,23 +336,25 @@ class ReportSynthesizer:
logger.info(f"Completed processing all {total_chunks} chunks") logger.info(f"Completed processing all {total_chunks} chunks")
return processed_chunks return processed_chunks
def _get_extraction_prompt(self, detail_level: str) -> str: def _get_extraction_prompt(self, detail_level: str, query_type: str = "exploratory") -> str:
""" """
Get the appropriate extraction prompt based on detail level. Get the appropriate extraction prompt based on detail level and query type.
Args: Args:
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive) detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
query_type: Type of query (factual, exploratory, comparative)
Returns: Returns:
Extraction prompt as a string Extraction prompt as a string
""" """
# Base prompts by detail level
if detail_level.lower() in ["brief", "standard"]: if detail_level.lower() in ["brief", "standard"]:
return """You are an expert research assistant. Extract the most relevant information from this document chunk that addresses the user's query. base_prompt = """You are an expert research assistant. Extract the most relevant information from this document chunk that addresses the user's query.
Focus on factual information, key concepts, and important details. Focus on factual information, key concepts, and important details.
Include any relevant statistics, definitions, or explanations that would be valuable for a report. Include any relevant statistics, definitions, or explanations that would be valuable for a report.
Format your response as a concise summary with bullet points for key facts.""" Format your response as a concise summary with bullet points for key facts."""
elif detail_level.lower() == "detailed": elif detail_level.lower() == "detailed":
return """You are an expert research analyst with deep domain knowledge. Extract comprehensive information from this document chunk that addresses the user's query. base_prompt = """You are an expert research analyst with deep domain knowledge. Extract comprehensive information from this document chunk that addresses the user's query.
Focus on: Focus on:
- Detailed factual information and evidence - Detailed factual information and evidence
- Underlying principles and mechanisms - Underlying principles and mechanisms
@@ -364,7 +367,7 @@ class ReportSynthesizer:
Prioritize depth of analysis over breadth. Extract information that provides deeper understanding rather than just basic facts. Prioritize depth of analysis over breadth. Extract information that provides deeper understanding rather than just basic facts.
Format your response with clear sections and bullet points for key insights.""" Format your response with clear sections and bullet points for key insights."""
else: # comprehensive else: # comprehensive
return """You are a world-class research analyst with exceptional analytical abilities. Extract the most comprehensive and nuanced information from this document chunk. base_prompt = """You are a world-class research analyst with exceptional analytical abilities. Extract the most comprehensive and nuanced information from this document chunk.
Focus on: Focus on:
- Multi-layered analysis of all relevant facts and evidence - Multi-layered analysis of all relevant facts and evidence
- Complex causal networks and interaction effects - Complex causal networks and interaction effects
@@ -379,6 +382,25 @@ class ReportSynthesizer:
Analyze the reliability and significance of the information. Analyze the reliability and significance of the information.
Format your response with clearly organized sections and detailed bullet points.""" Format your response with clearly organized sections and detailed bullet points."""
# Add specific instructions for comparative queries
if query_type.lower() == "comparative":
comparative_instructions = """
IMPORTANT: This is a COMPARATIVE query. The user is asking to compare two or more things.
When extracting information, focus specifically on:
1. Characteristics, features, or attributes of EACH item being compared
2. Direct comparisons between the items mentioned in the query
3. Advantages and disadvantages of each item
4. Similarities and differences between the items
5. Contexts where one item might be preferred over others
Make sure to clearly identify which information relates to which item being compared.
Organize your extraction to facilitate easy comparison between the items.
"""
return base_prompt + comparative_instructions
return base_prompt
def _get_template_from_strings(self, query_type_str: str, detail_level_str: str) -> Optional[ReportTemplate]: def _get_template_from_strings(self, query_type_str: str, detail_level_str: str) -> Optional[ReportTemplate]:
""" """
Helper method to get a template using string values for query_type and detail_level. Helper method to get a template using string values for query_type and detail_level.
@@ -554,13 +576,15 @@ class ReportSynthesizer:
config = detail_level_manager.get_detail_level_config(detail_level) config = detail_level_manager.get_detail_level_config(detail_level)
token_budget = config.get("token_budget", 100000) token_budget = config.get("token_budget", 100000)
# Determine query type if not specified # Determine query type based on the query text
if query_type == "exploratory": # Always try to infer the query type, regardless of what was passed in
# Try to infer query type from the query text
if any(term in query.lower() for term in ["what is", "who is", "when did", "where is", "how does"]): if any(term in query.lower() for term in ["what is", "who is", "when did", "where is", "how does"]):
query_type = "factual" query_type = "factual"
elif any(term in query.lower() for term in ["compare", "difference", "versus", "pros and cons"]): elif any(term in query.lower() for term in ["compare", "difference", "versus", "vs", "pros and cons"]):
query_type = "comparative" query_type = "comparative"
else:
# Default to exploratory if no specific pattern is detected
query_type = "exploratory"
logger.info(f"Query type determined as: {query_type}") logger.info(f"Query type determined as: {query_type}")
@@ -603,7 +627,7 @@ class ReportSynthesizer:
chunk['title'] = 'Untitled' chunk['title'] = 'Untitled'
# Process this batch # Process this batch
batch_results = await self.map_document_chunks(batch, query, detail_level) batch_results = await self.map_document_chunks(batch, query, detail_level, query_type)
processed_chunks.extend(batch_results) processed_chunks.extend(batch_results)
# Add a small delay between batches to avoid rate limiting # Add a small delay between batches to avoid rate limiting