Improve query type detection and add specialized extraction prompts for comparative queries
This commit is contained in:
parent
21f75c0d25
commit
c8c5240657
|
@ -121,7 +121,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
|
||||||
|
|
||||||
return prioritized_chunks
|
return prioritized_chunks
|
||||||
|
|
||||||
async def extract_information_from_chunk(self, chunk: Dict[str, Any], query: str, detail_level: str = "comprehensive") -> str:
|
async def extract_information_from_chunk(self, chunk: Dict[str, Any], query: str, detail_level: str = "comprehensive", query_type: str = "exploratory") -> str:
|
||||||
"""
|
"""
|
||||||
Extract key information from a document chunk.
|
Extract key information from a document chunk.
|
||||||
|
|
||||||
|
@ -129,12 +129,13 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
|
||||||
chunk: Document chunk
|
chunk: Document chunk
|
||||||
query: Original search query
|
query: Original search query
|
||||||
detail_level: Level of detail for extraction
|
detail_level: Level of detail for extraction
|
||||||
|
query_type: Type of query (factual, exploratory, comparative)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Extracted information as a string
|
Extracted information as a string
|
||||||
"""
|
"""
|
||||||
# Get the appropriate extraction prompt based on detail level
|
# Get the appropriate extraction prompt based on detail level and query type
|
||||||
extraction_prompt = self._get_extraction_prompt(detail_level)
|
extraction_prompt = self._get_extraction_prompt(detail_level, query_type)
|
||||||
|
|
||||||
# Create a prompt for extracting key information from the chunk
|
# Create a prompt for extracting key information from the chunk
|
||||||
messages = [
|
messages = [
|
||||||
|
@ -266,7 +267,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
|
||||||
logger.info(f"Initializing report with {len(initial_chunks)} chunks")
|
logger.info(f"Initializing report with {len(initial_chunks)} chunks")
|
||||||
|
|
||||||
# Process initial chunks using the standard map-reduce approach
|
# Process initial chunks using the standard map-reduce approach
|
||||||
processed_chunks = await self.map_document_chunks(initial_chunks, query, detail_level)
|
processed_chunks = await self.map_document_chunks(initial_chunks, query, detail_level, query_type)
|
||||||
|
|
||||||
# Generate initial report
|
# Generate initial report
|
||||||
initial_report = await self.reduce_processed_chunks(processed_chunks, query, query_type, detail_level)
|
initial_report = await self.reduce_processed_chunks(processed_chunks, query, query_type, detail_level)
|
||||||
|
@ -387,7 +388,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
|
||||||
# Extract information from chunks
|
# Extract information from chunks
|
||||||
new_information = []
|
new_information = []
|
||||||
for chunk in next_batch:
|
for chunk in next_batch:
|
||||||
extracted_info = await self.extract_information_from_chunk(chunk, query, detail_level)
|
extracted_info = await self.extract_information_from_chunk(chunk, query, detail_level, query_type)
|
||||||
new_information.append((chunk, extracted_info))
|
new_information.append((chunk, extracted_info))
|
||||||
|
|
||||||
# Mark chunk as processed
|
# Mark chunk as processed
|
||||||
|
|
|
@ -247,7 +247,7 @@ class ReportSynthesizer:
|
||||||
|
|
||||||
return clean_content.strip()
|
return clean_content.strip()
|
||||||
|
|
||||||
async def map_document_chunks(self, chunks: List[Dict[str, Any]], query: str, detail_level: str = "standard") -> List[Dict[str, Any]]:
|
async def map_document_chunks(self, chunks: List[Dict[str, Any]], query: str, detail_level: str = "standard", query_type: str = "exploratory") -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Map phase: Process individual document chunks to extract key information.
|
Map phase: Process individual document chunks to extract key information.
|
||||||
|
|
||||||
|
@ -255,14 +255,15 @@ class ReportSynthesizer:
|
||||||
chunks: List of document chunks
|
chunks: List of document chunks
|
||||||
query: Original search query
|
query: Original search query
|
||||||
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
|
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
|
||||||
|
query_type: Type of query (factual, exploratory, comparative)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of processed chunks with extracted information
|
List of processed chunks with extracted information
|
||||||
"""
|
"""
|
||||||
processed_chunks = []
|
processed_chunks = []
|
||||||
|
|
||||||
# Get the appropriate extraction prompt based on detail level
|
# Get the appropriate extraction prompt based on detail level and query type
|
||||||
extraction_prompt = self._get_extraction_prompt(detail_level)
|
extraction_prompt = self._get_extraction_prompt(detail_level, query_type)
|
||||||
|
|
||||||
total_chunks = len(chunks)
|
total_chunks = len(chunks)
|
||||||
logger.info(f"Starting to process {total_chunks} document chunks")
|
logger.info(f"Starting to process {total_chunks} document chunks")
|
||||||
|
@ -335,23 +336,25 @@ class ReportSynthesizer:
|
||||||
logger.info(f"Completed processing all {total_chunks} chunks")
|
logger.info(f"Completed processing all {total_chunks} chunks")
|
||||||
return processed_chunks
|
return processed_chunks
|
||||||
|
|
||||||
def _get_extraction_prompt(self, detail_level: str) -> str:
|
def _get_extraction_prompt(self, detail_level: str, query_type: str = "exploratory") -> str:
|
||||||
"""
|
"""
|
||||||
Get the appropriate extraction prompt based on detail level.
|
Get the appropriate extraction prompt based on detail level and query type.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
|
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
|
||||||
|
query_type: Type of query (factual, exploratory, comparative)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Extraction prompt as a string
|
Extraction prompt as a string
|
||||||
"""
|
"""
|
||||||
|
# Base prompts by detail level
|
||||||
if detail_level.lower() in ["brief", "standard"]:
|
if detail_level.lower() in ["brief", "standard"]:
|
||||||
return """You are an expert research assistant. Extract the most relevant information from this document chunk that addresses the user's query.
|
base_prompt = """You are an expert research assistant. Extract the most relevant information from this document chunk that addresses the user's query.
|
||||||
Focus on factual information, key concepts, and important details.
|
Focus on factual information, key concepts, and important details.
|
||||||
Include any relevant statistics, definitions, or explanations that would be valuable for a report.
|
Include any relevant statistics, definitions, or explanations that would be valuable for a report.
|
||||||
Format your response as a concise summary with bullet points for key facts."""
|
Format your response as a concise summary with bullet points for key facts."""
|
||||||
elif detail_level.lower() == "detailed":
|
elif detail_level.lower() == "detailed":
|
||||||
return """You are an expert research analyst with deep domain knowledge. Extract comprehensive information from this document chunk that addresses the user's query.
|
base_prompt = """You are an expert research analyst with deep domain knowledge. Extract comprehensive information from this document chunk that addresses the user's query.
|
||||||
Focus on:
|
Focus on:
|
||||||
- Detailed factual information and evidence
|
- Detailed factual information and evidence
|
||||||
- Underlying principles and mechanisms
|
- Underlying principles and mechanisms
|
||||||
|
@ -364,7 +367,7 @@ class ReportSynthesizer:
|
||||||
Prioritize depth of analysis over breadth. Extract information that provides deeper understanding rather than just basic facts.
|
Prioritize depth of analysis over breadth. Extract information that provides deeper understanding rather than just basic facts.
|
||||||
Format your response with clear sections and bullet points for key insights."""
|
Format your response with clear sections and bullet points for key insights."""
|
||||||
else: # comprehensive
|
else: # comprehensive
|
||||||
return """You are a world-class research analyst with exceptional analytical abilities. Extract the most comprehensive and nuanced information from this document chunk.
|
base_prompt = """You are a world-class research analyst with exceptional analytical abilities. Extract the most comprehensive and nuanced information from this document chunk.
|
||||||
Focus on:
|
Focus on:
|
||||||
- Multi-layered analysis of all relevant facts and evidence
|
- Multi-layered analysis of all relevant facts and evidence
|
||||||
- Complex causal networks and interaction effects
|
- Complex causal networks and interaction effects
|
||||||
|
@ -379,6 +382,25 @@ class ReportSynthesizer:
|
||||||
Analyze the reliability and significance of the information.
|
Analyze the reliability and significance of the information.
|
||||||
Format your response with clearly organized sections and detailed bullet points."""
|
Format your response with clearly organized sections and detailed bullet points."""
|
||||||
|
|
||||||
|
# Add specific instructions for comparative queries
|
||||||
|
if query_type.lower() == "comparative":
|
||||||
|
comparative_instructions = """
|
||||||
|
IMPORTANT: This is a COMPARATIVE query. The user is asking to compare two or more things.
|
||||||
|
|
||||||
|
When extracting information, focus specifically on:
|
||||||
|
1. Characteristics, features, or attributes of EACH item being compared
|
||||||
|
2. Direct comparisons between the items mentioned in the query
|
||||||
|
3. Advantages and disadvantages of each item
|
||||||
|
4. Similarities and differences between the items
|
||||||
|
5. Contexts where one item might be preferred over others
|
||||||
|
|
||||||
|
Make sure to clearly identify which information relates to which item being compared.
|
||||||
|
Organize your extraction to facilitate easy comparison between the items.
|
||||||
|
"""
|
||||||
|
return base_prompt + comparative_instructions
|
||||||
|
|
||||||
|
return base_prompt
|
||||||
|
|
||||||
def _get_template_from_strings(self, query_type_str: str, detail_level_str: str) -> Optional[ReportTemplate]:
|
def _get_template_from_strings(self, query_type_str: str, detail_level_str: str) -> Optional[ReportTemplate]:
|
||||||
"""
|
"""
|
||||||
Helper method to get a template using string values for query_type and detail_level.
|
Helper method to get a template using string values for query_type and detail_level.
|
||||||
|
@ -554,13 +576,15 @@ class ReportSynthesizer:
|
||||||
config = detail_level_manager.get_detail_level_config(detail_level)
|
config = detail_level_manager.get_detail_level_config(detail_level)
|
||||||
token_budget = config.get("token_budget", 100000)
|
token_budget = config.get("token_budget", 100000)
|
||||||
|
|
||||||
# Determine query type if not specified
|
# Determine query type based on the query text
|
||||||
if query_type == "exploratory":
|
# Always try to infer the query type, regardless of what was passed in
|
||||||
# Try to infer query type from the query text
|
|
||||||
if any(term in query.lower() for term in ["what is", "who is", "when did", "where is", "how does"]):
|
if any(term in query.lower() for term in ["what is", "who is", "when did", "where is", "how does"]):
|
||||||
query_type = "factual"
|
query_type = "factual"
|
||||||
elif any(term in query.lower() for term in ["compare", "difference", "versus", "pros and cons"]):
|
elif any(term in query.lower() for term in ["compare", "difference", "versus", "vs", "pros and cons"]):
|
||||||
query_type = "comparative"
|
query_type = "comparative"
|
||||||
|
else:
|
||||||
|
# Default to exploratory if no specific pattern is detected
|
||||||
|
query_type = "exploratory"
|
||||||
|
|
||||||
logger.info(f"Query type determined as: {query_type}")
|
logger.info(f"Query type determined as: {query_type}")
|
||||||
|
|
||||||
|
@ -603,7 +627,7 @@ class ReportSynthesizer:
|
||||||
chunk['title'] = 'Untitled'
|
chunk['title'] = 'Untitled'
|
||||||
|
|
||||||
# Process this batch
|
# Process this batch
|
||||||
batch_results = await self.map_document_chunks(batch, query, detail_level)
|
batch_results = await self.map_document_chunks(batch, query, detail_level, query_type)
|
||||||
processed_chunks.extend(batch_results)
|
processed_chunks.extend(batch_results)
|
||||||
|
|
||||||
# Add a small delay between batches to avoid rate limiting
|
# Add a small delay between batches to avoid rate limiting
|
||||||
|
|
Loading…
Reference in New Issue