Improve query type detection and add specialized extraction prompts for comparative queries

Steve White 2025-03-12 11:57:40 -05:00
parent 21f75c0d25
commit c8c5240657
2 changed files with 46 additions and 21 deletions


@@ -121,7 +121,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
return prioritized_chunks
async def extract_information_from_chunk(self, chunk: Dict[str, Any], query: str, detail_level: str = "comprehensive") -> str:
async def extract_information_from_chunk(self, chunk: Dict[str, Any], query: str, detail_level: str = "comprehensive", query_type: str = "exploratory") -> str:
"""
Extract key information from a document chunk.
@@ -129,12 +129,13 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
chunk: Document chunk
query: Original search query
detail_level: Level of detail for extraction
query_type: Type of query (factual, exploratory, comparative)
Returns:
Extracted information as a string
"""
# Get the appropriate extraction prompt based on detail level
extraction_prompt = self._get_extraction_prompt(detail_level)
# Get the appropriate extraction prompt based on detail level and query type
extraction_prompt = self._get_extraction_prompt(detail_level, query_type)
# Create a prompt for extracting key information from the chunk
messages = [
@@ -266,7 +267,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
logger.info(f"Initializing report with {len(initial_chunks)} chunks")
# Process initial chunks using the standard map-reduce approach
processed_chunks = await self.map_document_chunks(initial_chunks, query, detail_level)
processed_chunks = await self.map_document_chunks(initial_chunks, query, detail_level, query_type)
# Generate initial report
initial_report = await self.reduce_processed_chunks(processed_chunks, query, query_type, detail_level)
@@ -387,7 +388,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
# Extract information from chunks
new_information = []
for chunk in next_batch:
extracted_info = await self.extract_information_from_chunk(chunk, query, detail_level)
extracted_info = await self.extract_information_from_chunk(chunk, query, detail_level, query_type)
new_information.append((chunk, extracted_info))
# Mark chunk as processed

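As context for the changes above, here is a minimal, self-contained sketch of how the new query_type argument flows from batch processing into prompt selection in the progressive synthesizer. The class and method bodies below are illustrative stubs, not the real implementation; only the parameter threading mirrors the diff.

from typing import Any, Dict, List

class ProgressiveSynthesizerSketch:
    """Illustrative stub; not the real ProgressiveReportSynthesizer."""

    def _get_extraction_prompt(self, detail_level: str, query_type: str = "exploratory") -> str:
        # Placeholder: the real method returns the full prompts shown in the ReportSynthesizer hunks below.
        return f"extraction prompt for detail_level={detail_level}, query_type={query_type}"

    async def extract_information_from_chunk(self, chunk: Dict[str, Any], query: str,
                                              detail_level: str = "comprehensive",
                                              query_type: str = "exploratory") -> str:
        # query_type is now forwarded into prompt selection instead of being dropped here.
        extraction_prompt = self._get_extraction_prompt(detail_level, query_type)
        return f"{extraction_prompt} | chunk={chunk.get('title', 'Untitled')} | query={query}"

    async def process_batch(self, batch: List[Dict[str, Any]], query: str,
                            detail_level: str, query_type: str) -> List[str]:
        # Mirrors the loop above: every chunk extraction receives the query type.
        return [await self.extract_information_from_chunk(chunk, query, detail_level, query_type)
                for chunk in batch]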

@@ -247,7 +247,7 @@ class ReportSynthesizer:
return clean_content.strip()
async def map_document_chunks(self, chunks: List[Dict[str, Any]], query: str, detail_level: str = "standard") -> List[Dict[str, Any]]:
async def map_document_chunks(self, chunks: List[Dict[str, Any]], query: str, detail_level: str = "standard", query_type: str = "exploratory") -> List[Dict[str, Any]]:
"""
Map phase: Process individual document chunks to extract key information.
@@ -255,14 +255,15 @@ class ReportSynthesizer:
chunks: List of document chunks
query: Original search query
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
query_type: Type of query (factual, exploratory, comparative)
Returns:
List of processed chunks with extracted information
"""
processed_chunks = []
# Get the appropriate extraction prompt based on detail level
extraction_prompt = self._get_extraction_prompt(detail_level)
# Get the appropriate extraction prompt based on detail level and query type
extraction_prompt = self._get_extraction_prompt(detail_level, query_type)
total_chunks = len(chunks)
logger.info(f"Starting to process {total_chunks} document chunks")
@@ -335,23 +336,25 @@ class ReportSynthesizer:
logger.info(f"Completed processing all {total_chunks} chunks")
return processed_chunks
def _get_extraction_prompt(self, detail_level: str) -> str:
def _get_extraction_prompt(self, detail_level: str, query_type: str = "exploratory") -> str:
"""
Get the appropriate extraction prompt based on detail level.
Get the appropriate extraction prompt based on detail level and query type.
Args:
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
query_type: Type of query (factual, exploratory, comparative)
Returns:
Extraction prompt as a string
"""
# Base prompts by detail level
if detail_level.lower() in ["brief", "standard"]:
return """You are an expert research assistant. Extract the most relevant information from this document chunk that addresses the user's query.
base_prompt = """You are an expert research assistant. Extract the most relevant information from this document chunk that addresses the user's query.
Focus on factual information, key concepts, and important details.
Include any relevant statistics, definitions, or explanations that would be valuable for a report.
Format your response as a concise summary with bullet points for key facts."""
elif detail_level.lower() == "detailed":
return """You are an expert research analyst with deep domain knowledge. Extract comprehensive information from this document chunk that addresses the user's query.
base_prompt = """You are an expert research analyst with deep domain knowledge. Extract comprehensive information from this document chunk that addresses the user's query.
Focus on:
- Detailed factual information and evidence
- Underlying principles and mechanisms
@@ -364,7 +367,7 @@ class ReportSynthesizer:
Prioritize depth of analysis over breadth. Extract information that provides deeper understanding rather than just basic facts.
Format your response with clear sections and bullet points for key insights."""
else: # comprehensive
return """You are a world-class research analyst with exceptional analytical abilities. Extract the most comprehensive and nuanced information from this document chunk.
base_prompt = """You are a world-class research analyst with exceptional analytical abilities. Extract the most comprehensive and nuanced information from this document chunk.
Focus on:
- Multi-layered analysis of all relevant facts and evidence
- Complex causal networks and interaction effects
@@ -379,6 +382,25 @@ class ReportSynthesizer:
Analyze the reliability and significance of the information.
Format your response with clearly organized sections and detailed bullet points."""
# Add specific instructions for comparative queries
if query_type.lower() == "comparative":
comparative_instructions = """
IMPORTANT: This is a COMPARATIVE query. The user is asking to compare two or more things.
When extracting information, focus specifically on:
1. Characteristics, features, or attributes of EACH item being compared
2. Direct comparisons between the items mentioned in the query
3. Advantages and disadvantages of each item
4. Similarities and differences between the items
5. Contexts where one item might be preferred over others
Make sure to clearly identify which information relates to which item being compared.
Organize your extraction to facilitate easy comparison between the items.
"""
return base_prompt + comparative_instructions
return base_prompt
def _get_template_from_strings(self, query_type_str: str, detail_level_str: str) -> Optional[ReportTemplate]:
"""
Helper method to get a template using string values for query_type and detail_level.
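Condensed, the selection logic added in the hunk above works as follows. This is a standalone sketch for illustration; the prompt strings are shortened placeholders standing in for the full prompts shown in the diff.

def get_extraction_prompt(detail_level: str, query_type: str = "exploratory") -> str:
    # Choose a base prompt by detail level (placeholders for the full prompts above).
    if detail_level.lower() in ["brief", "standard"]:
        base_prompt = "...brief/standard extraction prompt..."
    elif detail_level.lower() == "detailed":
        base_prompt = "...detailed extraction prompt..."
    else:  # comprehensive
        base_prompt = "...comprehensive extraction prompt..."

    # Comparative queries get the extra instructions appended to whichever base was chosen.
    if query_type.lower() == "comparative":
        return base_prompt + "\n...comparative instructions..."
    return base_prompt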
@@ -554,13 +576,15 @@ class ReportSynthesizer:
config = detail_level_manager.get_detail_level_config(detail_level)
token_budget = config.get("token_budget", 100000)
# Determine query type if not specified
if query_type == "exploratory":
# Try to infer query type from the query text
if any(term in query.lower() for term in ["what is", "who is", "when did", "where is", "how does"]):
query_type = "factual"
elif any(term in query.lower() for term in ["compare", "difference", "versus", "pros and cons"]):
query_type = "comparative"
# Determine query type based on the query text
# Always try to infer the query type, regardless of what was passed in
if any(term in query.lower() for term in ["what is", "who is", "when did", "where is", "how does"]):
query_type = "factual"
elif any(term in query.lower() for term in ["compare", "difference", "versus", "vs", "pros and cons"]):
query_type = "comparative"
else:
# Default to exploratory if no specific pattern is detected
query_type = "exploratory"
logger.info(f"Query type determined as: {query_type}")
@@ -603,7 +627,7 @@ class ReportSynthesizer:
chunk['title'] = 'Untitled'
# Process this batch
batch_results = await self.map_document_chunks(batch, query, detail_level)
batch_results = await self.map_document_chunks(batch, query, detail_level, query_type)
processed_chunks.extend(batch_results)
# Add a small delay between batches to avoid rate limiting
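For completeness, a sketch of the surrounding batch loop that the last hunk feeds into. The function name, batch size, and delay value are illustrative assumptions, not values taken from this commit; only the map_document_chunks call with query_type reflects the change.

import asyncio
from typing import Any, Dict, List

async def map_in_batches(synthesizer, chunks: List[Dict[str, Any]], query: str,
                         detail_level: str, query_type: str,
                         batch_size: int = 10, delay_seconds: float = 1.0) -> List[Dict[str, Any]]:
    # Assumed driver loop: process chunks in batches, forwarding query_type as in the hunk above.
    processed_chunks: List[Dict[str, Any]] = []
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        batch_results = await synthesizer.map_document_chunks(batch, query, detail_level, query_type)
        processed_chunks.extend(batch_results)
        # Small delay between batches to avoid rate limiting.
        await asyncio.sleep(delay_seconds)
    return processed_chunks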