Improve query type detection and add specialized extraction prompts for comparative queries

Steve White 2025-03-12 11:57:40 -05:00
parent 21f75c0d25
commit c8c5240657
2 changed files with 46 additions and 21 deletions


@@ -121,7 +121,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
return prioritized_chunks
async def extract_information_from_chunk(self, chunk: Dict[str, Any], query: str, detail_level: str = "comprehensive") -> str:
async def extract_information_from_chunk(self, chunk: Dict[str, Any], query: str, detail_level: str = "comprehensive", query_type: str = "exploratory") -> str:
"""
Extract key information from a document chunk.
@@ -129,12 +129,13 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
chunk: Document chunk
query: Original search query
detail_level: Level of detail for extraction
query_type: Type of query (factual, exploratory, comparative)
Returns:
Extracted information as a string
"""
# Get the appropriate extraction prompt based on detail level
extraction_prompt = self._get_extraction_prompt(detail_level)
# Get the appropriate extraction prompt based on detail level and query type
extraction_prompt = self._get_extraction_prompt(detail_level, query_type)
# Create a prompt for extracting key information from the chunk
messages = [
@@ -266,7 +267,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
logger.info(f"Initializing report with {len(initial_chunks)} chunks")
# Process initial chunks using the standard map-reduce approach
processed_chunks = await self.map_document_chunks(initial_chunks, query, detail_level)
processed_chunks = await self.map_document_chunks(initial_chunks, query, detail_level, query_type)
# Generate initial report
initial_report = await self.reduce_processed_chunks(processed_chunks, query, query_type, detail_level)
@@ -387,7 +388,7 @@ class ProgressiveReportSynthesizer(ReportSynthesizer):
# Extract information from chunks
new_information = []
for chunk in next_batch:
extracted_info = await self.extract_information_from_chunk(chunk, query, detail_level)
extracted_info = await self.extract_information_from_chunk(chunk, query, detail_level, query_type)
new_information.append((chunk, extracted_info))
# Mark chunk as processed

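As context for the changes above, here is a minimal, self-contained sketch of how the new query_type argument flows from batch processing into prompt selection in the progressive synthesizer. The class and method bodies below are illustrative stubs, not the real implementation; only the parameter threading mirrors the diff.

from typing import Any, Dict, List

class ProgressiveSynthesizerSketch:
    """Illustrative stub; not the real ProgressiveReportSynthesizer."""

    def _get_extraction_prompt(self, detail_level: str, query_type: str = "exploratory") -> str:
        # Placeholder: the real method returns the full prompts shown in the ReportSynthesizer hunks below.
        return f"extraction prompt for detail_level={detail_level}, query_type={query_type}"

    async def extract_information_from_chunk(self, chunk: Dict[str, Any], query: str,
                                              detail_level: str = "comprehensive",
                                              query_type: str = "exploratory") -> str:
        # query_type is now forwarded into prompt selection instead of being dropped here.
        extraction_prompt = self._get_extraction_prompt(detail_level, query_type)
        return f"{extraction_prompt} | chunk={chunk.get('title', 'Untitled')} | query={query}"

    async def process_batch(self, batch: List[Dict[str, Any]], query: str,
                            detail_level: str, query_type: str) -> List[str]:
        # Mirrors the loop above: every chunk extraction receives the query type.
        return [await self.extract_information_from_chunk(chunk, query, detail_level, query_type)
                for chunk in batch]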

@@ -247,7 +247,7 @@ class ReportSynthesizer:
return clean_content.strip()
async def map_document_chunks(self, chunks: List[Dict[str, Any]], query: str, detail_level: str = "standard") -> List[Dict[str, Any]]:
async def map_document_chunks(self, chunks: List[Dict[str, Any]], query: str, detail_level: str = "standard", query_type: str = "exploratory") -> List[Dict[str, Any]]:
"""
Map phase: Process individual document chunks to extract key information.
@@ -255,14 +255,15 @@ class ReportSynthesizer:
chunks: List of document chunks
query: Original search query
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
query_type: Type of query (factual, exploratory, comparative)
Returns:
List of processed chunks with extracted information
"""
processed_chunks = []
# Get the appropriate extraction prompt based on detail level
extraction_prompt = self._get_extraction_prompt(detail_level)
# Get the appropriate extraction prompt based on detail level and query type
extraction_prompt = self._get_extraction_prompt(detail_level, query_type)
total_chunks = len(chunks)
logger.info(f"Starting to process {total_chunks} document chunks")
@@ -335,23 +336,25 @@ class ReportSynthesizer:
logger.info(f"Completed processing all {total_chunks} chunks")
return processed_chunks
def _get_extraction_prompt(self, detail_level: str) -> str:
def _get_extraction_prompt(self, detail_level: str, query_type: str = "exploratory") -> str:
"""
Get the appropriate extraction prompt based on detail level.
Get the appropriate extraction prompt based on detail level and query type.
Args:
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
query_type: Type of query (factual, exploratory, comparative)
Returns:
Extraction prompt as a string
"""
# Base prompts by detail level
if detail_level.lower() in ["brief", "standard"]:
return """You are an expert research assistant. Extract the most relevant information from this document chunk that addresses the user's query.
base_prompt = """You are an expert research assistant. Extract the most relevant information from this document chunk that addresses the user's query.
Focus on factual information, key concepts, and important details.
Include any relevant statistics, definitions, or explanations that would be valuable for a report.
Format your response as a concise summary with bullet points for key facts."""
elif detail_level.lower() == "detailed":
return """You are an expert research analyst with deep domain knowledge. Extract comprehensive information from this document chunk that addresses the user's query.
base_prompt = """You are an expert research analyst with deep domain knowledge. Extract comprehensive information from this document chunk that addresses the user's query.
Focus on:
- Detailed factual information and evidence
- Underlying principles and mechanisms
@@ -364,7 +367,7 @@ class ReportSynthesizer:
Prioritize depth of analysis over breadth. Extract information that provides deeper understanding rather than just basic facts.
Format your response with clear sections and bullet points for key insights."""
else: # comprehensive
return """You are a world-class research analyst with exceptional analytical abilities. Extract the most comprehensive and nuanced information from this document chunk.
base_prompt = """You are a world-class research analyst with exceptional analytical abilities. Extract the most comprehensive and nuanced information from this document chunk.
Focus on:
- Multi-layered analysis of all relevant facts and evidence
- Complex causal networks and interaction effects
@@ -379,6 +382,25 @@ class ReportSynthesizer:
Analyze the reliability and significance of the information.
Format your response with clearly organized sections and detailed bullet points."""
# Add specific instructions for comparative queries
if query_type.lower() == "comparative":
comparative_instructions = """
IMPORTANT: This is a COMPARATIVE query. The user is asking to compare two or more things.
When extracting information, focus specifically on:
1. Characteristics, features, or attributes of EACH item being compared
2. Direct comparisons between the items mentioned in the query
3. Advantages and disadvantages of each item
4. Similarities and differences between the items
5. Contexts where one item might be preferred over others
Make sure to clearly identify which information relates to which item being compared.
Organize your extraction to facilitate easy comparison between the items.
"""
return base_prompt + comparative_instructions
return base_prompt
def _get_template_from_strings(self, query_type_str: str, detail_level_str: str) -> Optional[ReportTemplate]:
"""
Helper method to get a template using string values for query_type and detail_level.
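Condensed, the selection logic added in the hunk above works as follows. This is a standalone sketch for illustration; the prompt strings are shortened placeholders standing in for the full prompts shown in the diff.

def get_extraction_prompt(detail_level: str, query_type: str = "exploratory") -> str:
    # Choose a base prompt by detail level (placeholders for the full prompts above).
    if detail_level.lower() in ["brief", "standard"]:
        base_prompt = "...brief/standard extraction prompt..."
    elif detail_level.lower() == "detailed":
        base_prompt = "...detailed extraction prompt..."
    else:  # comprehensive
        base_prompt = "...comprehensive extraction prompt..."

    # Comparative queries get the extra instructions appended to whichever base was chosen.
    if query_type.lower() == "comparative":
        return base_prompt + "\n...comparative instructions..."
    return base_prompt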
@@ -554,13 +576,15 @@ class ReportSynthesizer:
config = detail_level_manager.get_detail_level_config(detail_level)
token_budget = config.get("token_budget", 100000)
# Determine query type if not specified
if query_type == "exploratory":
# Try to infer query type from the query text
if any(term in query.lower() for term in ["what is", "who is", "when did", "where is", "how does"]):
query_type = "factual"
elif any(term in query.lower() for term in ["compare", "difference", "versus", "pros and cons"]):
query_type = "comparative"
# Determine query type based on the query text
# Always try to infer the query type, regardless of what was passed in
if any(term in query.lower() for term in ["what is", "who is", "when did", "where is", "how does"]):
query_type = "factual"
elif any(term in query.lower() for term in ["compare", "difference", "versus", "vs", "pros and cons"]):
query_type = "comparative"
else:
# Default to exploratory if no specific pattern is detected
query_type = "exploratory"
logger.info(f"Query type determined as: {query_type}")
@@ -603,7 +627,7 @@ class ReportSynthesizer:
chunk['title'] = 'Untitled'
# Process this batch
batch_results = await self.map_document_chunks(batch, query, detail_level)
batch_results = await self.map_document_chunks(batch, query, detail_level, query_type)
processed_chunks.extend(batch_results)
# Add a small delay between batches to avoid rate limiting
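For completeness, a sketch of the surrounding batch loop that the last hunk feeds into. The function name, batch size, and delay value are illustrative assumptions, not values taken from this commit; only the map_document_chunks call with query_type reflects the change.

import asyncio
from typing import Any, Dict, List

async def map_in_batches(synthesizer, chunks: List[Dict[str, Any]], query: str,
                         detail_level: str, query_type: str,
                         batch_size: int = 10, delay_seconds: float = 1.0) -> List[Dict[str, Any]]:
    # Assumed driver loop: process chunks in batches, forwarding query_type as in the hunk above.
    processed_chunks: List[Dict[str, Any]] = []
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        batch_results = await synthesizer.map_document_chunks(batch, query, detail_level, query_type)
        processed_chunks.extend(batch_results)
        # Small delay between batches to avoid rate limiting.
        await asyncio.sleep(delay_seconds)
    return processed_chunks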