Improve query type detection and add specialized extraction prompts for comparative queries
This commit is contained in:
parent
21f75c0d25
commit
c8c5240657
|
@ -121,7 +121,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
|
|||
|
||||
return prioritized_chunks
|
||||
|
||||
async def extract_information_from_chunk(self, chunk: Dict[str, Any], query: str, detail_level: str = "comprehensive") -> str:
|
||||
async def extract_information_from_chunk(self, chunk: Dict[str, Any], query: str, detail_level: str = "comprehensive", query_type: str = "exploratory") -> str:
|
||||
"""
|
||||
Extract key information from a document chunk.
|
||||
|
||||
|
@ -129,12 +129,13 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
|
|||
chunk: Document chunk
|
||||
query: Original search query
|
||||
detail_level: Level of detail for extraction
|
||||
query_type: Type of query (factual, exploratory, comparative)
|
||||
|
||||
Returns:
|
||||
Extracted information as a string
|
||||
"""
|
||||
# Get the appropriate extraction prompt based on detail level
|
||||
extraction_prompt = self._get_extraction_prompt(detail_level)
|
||||
# Get the appropriate extraction prompt based on detail level and query type
|
||||
extraction_prompt = self._get_extraction_prompt(detail_level, query_type)
|
||||
|
||||
# Create a prompt for extracting key information from the chunk
|
||||
messages = [
|
||||
|
@ -266,7 +267,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
|
|||
logger.info(f"Initializing report with {len(initial_chunks)} chunks")
|
||||
|
||||
# Process initial chunks using the standard map-reduce approach
|
||||
processed_chunks = await self.map_document_chunks(initial_chunks, query, detail_level)
|
||||
processed_chunks = await self.map_document_chunks(initial_chunks, query, detail_level, query_type)
|
||||
|
||||
# Generate initial report
|
||||
initial_report = await self.reduce_processed_chunks(processed_chunks, query, query_type, detail_level)
|
||||
|
@ -387,7 +388,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
|
|||
# Extract information from chunks
|
||||
new_information = []
|
||||
for chunk in next_batch:
|
||||
extracted_info = await self.extract_information_from_chunk(chunk, query, detail_level)
|
||||
extracted_info = await self.extract_information_from_chunk(chunk, query, detail_level, query_type)
|
||||
new_information.append((chunk, extracted_info))
|
||||
|
||||
# Mark chunk as processed
|
||||
|
|
|
@ -247,7 +247,7 @@ class ReportSynthesizer:
|
|||
|
||||
return clean_content.strip()
|
||||
|
||||
async def map_document_chunks(self, chunks: List[Dict[str, Any]], query: str, detail_level: str = "standard") -> List[Dict[str, Any]]:
|
||||
async def map_document_chunks(self, chunks: List[Dict[str, Any]], query: str, detail_level: str = "standard", query_type: str = "exploratory") -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Map phase: Process individual document chunks to extract key information.
|
||||
|
||||
|
@ -255,14 +255,15 @@ class ReportSynthesizer:
|
|||
chunks: List of document chunks
|
||||
query: Original search query
|
||||
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
|
||||
query_type: Type of query (factual, exploratory, comparative)
|
||||
|
||||
Returns:
|
||||
List of processed chunks with extracted information
|
||||
"""
|
||||
processed_chunks = []
|
||||
|
||||
# Get the appropriate extraction prompt based on detail level
|
||||
extraction_prompt = self._get_extraction_prompt(detail_level)
|
||||
# Get the appropriate extraction prompt based on detail level and query type
|
||||
extraction_prompt = self._get_extraction_prompt(detail_level, query_type)
|
||||
|
||||
total_chunks = len(chunks)
|
||||
logger.info(f"Starting to process {total_chunks} document chunks")
|
||||
|
@ -335,23 +336,25 @@ class ReportSynthesizer:
|
|||
logger.info(f"Completed processing all {total_chunks} chunks")
|
||||
return processed_chunks
|
||||
|
||||
def _get_extraction_prompt(self, detail_level: str) -> str:
|
||||
def _get_extraction_prompt(self, detail_level: str, query_type: str = "exploratory") -> str:
|
||||
"""
|
||||
Get the appropriate extraction prompt based on detail level.
|
||||
Get the appropriate extraction prompt based on detail level and query type.
|
||||
|
||||
Args:
|
||||
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
|
||||
query_type: Type of query (factual, exploratory, comparative)
|
||||
|
||||
Returns:
|
||||
Extraction prompt as a string
|
||||
"""
|
||||
# Base prompts by detail level
|
||||
if detail_level.lower() in ["brief", "standard"]:
|
||||
return """You are an expert research assistant. Extract the most relevant information from this document chunk that addresses the user's query.
|
||||
base_prompt = """You are an expert research assistant. Extract the most relevant information from this document chunk that addresses the user's query.
|
||||
Focus on factual information, key concepts, and important details.
|
||||
Include any relevant statistics, definitions, or explanations that would be valuable for a report.
|
||||
Format your response as a concise summary with bullet points for key facts."""
|
||||
elif detail_level.lower() == "detailed":
|
||||
return """You are an expert research analyst with deep domain knowledge. Extract comprehensive information from this document chunk that addresses the user's query.
|
||||
base_prompt = """You are an expert research analyst with deep domain knowledge. Extract comprehensive information from this document chunk that addresses the user's query.
|
||||
Focus on:
|
||||
- Detailed factual information and evidence
|
||||
- Underlying principles and mechanisms
|
||||
|
@ -364,7 +367,7 @@ class ReportSynthesizer:
|
|||
Prioritize depth of analysis over breadth. Extract information that provides deeper understanding rather than just basic facts.
|
||||
Format your response with clear sections and bullet points for key insights."""
|
||||
else: # comprehensive
|
||||
return """You are a world-class research analyst with exceptional analytical abilities. Extract the most comprehensive and nuanced information from this document chunk.
|
||||
base_prompt = """You are a world-class research analyst with exceptional analytical abilities. Extract the most comprehensive and nuanced information from this document chunk.
|
||||
Focus on:
|
||||
- Multi-layered analysis of all relevant facts and evidence
|
||||
- Complex causal networks and interaction effects
|
||||
|
@ -379,6 +382,25 @@ class ReportSynthesizer:
|
|||
Analyze the reliability and significance of the information.
|
||||
Format your response with clearly organized sections and detailed bullet points."""
|
||||
|
||||
# Add specific instructions for comparative queries
|
||||
if query_type.lower() == "comparative":
|
||||
comparative_instructions = """
|
||||
IMPORTANT: This is a COMPARATIVE query. The user is asking to compare two or more things.
|
||||
|
||||
When extracting information, focus specifically on:
|
||||
1. Characteristics, features, or attributes of EACH item being compared
|
||||
2. Direct comparisons between the items mentioned in the query
|
||||
3. Advantages and disadvantages of each item
|
||||
4. Similarities and differences between the items
|
||||
5. Contexts where one item might be preferred over others
|
||||
|
||||
Make sure to clearly identify which information relates to which item being compared.
|
||||
Organize your extraction to facilitate easy comparison between the items.
|
||||
"""
|
||||
return base_prompt + comparative_instructions
|
||||
|
||||
return base_prompt
|
||||
|
||||
def _get_template_from_strings(self, query_type_str: str, detail_level_str: str) -> Optional[ReportTemplate]:
|
||||
"""
|
||||
Helper method to get a template using string values for query_type and detail_level.
|
||||
|
@ -554,13 +576,15 @@ class ReportSynthesizer:
|
|||
config = detail_level_manager.get_detail_level_config(detail_level)
|
||||
token_budget = config.get("token_budget", 100000)
|
||||
|
||||
# Determine query type if not specified
|
||||
if query_type == "exploratory":
|
||||
# Try to infer query type from the query text
|
||||
if any(term in query.lower() for term in ["what is", "who is", "when did", "where is", "how does"]):
|
||||
query_type = "factual"
|
||||
elif any(term in query.lower() for term in ["compare", "difference", "versus", "pros and cons"]):
|
||||
query_type = "comparative"
|
||||
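# Determine query type from the query text using case-insensitive substring matching (note: short terms such as "vs" also match inside longer words)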
|
||||
# Always infer the query type from the query text; every branch below reassigns query_type, so any value the caller passed in is overwritten
|
||||
if any(term in query.lower() for term in ["what is", "who is", "when did", "where is", "how does"]):
|
||||
query_type = "factual"
|
||||
elif any(term in query.lower() for term in ["compare", "difference", "versus", "vs", "pros and cons"]):
|
||||
query_type = "comparative"
|
||||
else:
|
||||
# Default to exploratory if no specific pattern is detected
|
||||
query_type = "exploratory"
|
||||
|
||||
logger.info(f"Query type determined as: {query_type}")
|
||||
|
||||
|
@ -603,7 +627,7 @@ class ReportSynthesizer:
|
|||
chunk['title'] = 'Untitled'
|
||||
|
||||
# Process this batch
|
||||
batch_results = await self.map_document_chunks(batch, query, detail_level)
|
||||
batch_results = await self.map_document_chunks(batch, query, detail_level, query_type)
|
||||
processed_chunks.extend(batch_results)
|
||||
|
||||
# Add a small delay between batches to avoid rate limiting
|
||||
|
|
Loading…
Reference in New Issue