microsoft · paullizer · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/application/single_app/functions_content.py b/application/single_app/functions_content.py
diff --git a/application/single_app/functions_documents.py b/application/single_app/functions_documents.py
@@ -4535,16 +4535,19 @@ def process_log(document_id, user_id, temp_file_path, original_filename, enable_
 
 def process_doc(document_id, user_id, temp_file_path, original_filename, enable_enhanced_citations, update_callback, group_id=None, public_workspace_id=None):
     """
-    Processes .doc and .docm files using docx2txt library.
+    Processes legacy .doc files via OLE piece tables and .docm files via docx2txt.
     Note: .docx files still use Document Intelligence for better formatting preservation.
     """
     is_group = group_id is not None
     is_public_workspace = public_workspace_id is not None
 
     update_callback(status=f"Processing {original_filename.split('.')[-1].upper()} file...")
     total_chunks_saved = 0
+    total_embedding_tokens = 0
+    embedding_model_name = None
     chunk_config = get_chunk_size_config(get_settings())
-    target_words_per_chunk = chunk_config.get('doc', {}).get('value', 400)  # Consistent with other text-based chunking
+    file_ext = os.path.splitext(original_filename)[1].lower().lstrip('.')
+    target_words_per_chunk = chunk_config.get(file_ext, {}).get('value', 400)
 
     if enable_enhanced_citations:
         args = {
@@ -4563,15 +4566,8 @@ def process_doc(document_id, user_id, temp_file_path, original_filename, enable_
         upload_to_blob(**args)
 
     try:
-        # Import docx2txt here to avoid dependency issues if not installed
-        try:
-            import docx2txt
-        except ImportError:
-            raise Exception("docx2txt library is required for .doc and .docm file processing. Install with: pip install docx2txt")
-
-        # Extract text from .doc or .docm file
         try:
-            text_content = docx2txt.process(temp_file_path)
+            text_content = extract_word_text(temp_file_path, f'.{file_ext}')
         except Exception as e:
             raise Exception(f"Error extracting text from {original_filename}: {e}")
 
@@ -4893,15 +4889,19 @@ def process_log(document_id, user_id, temp_file_path, original_filename, enable_
 
 def process_doc(document_id, user_id, temp_file_path, original_filename, enable_enhanced_citations, update_callback, group_id=None, public_workspace_id=None):
     """
-    Processes .doc and .docm files using docx2txt library.
+    Processes legacy .doc files via OLE piece tables and .docm files via docx2txt.
     Note: .docx files still use Document Intelligence for better formatting preservation.
     """
     is_group = group_id is not None
     is_public_workspace = public_workspace_id is not None
 
     update_callback(status=f"Processing {original_filename.split('.')[-1].upper()} file...")
     total_chunks_saved = 0
-    target_words_per_chunk = 400  # Consistent with other text-based chunking
+    total_embedding_tokens = 0
+    embedding_model_name = None
+    chunk_config = get_chunk_size_config(get_settings())
+    file_ext = os.path.splitext(original_filename)[1].lower().lstrip('.')
+    target_words_per_chunk = chunk_config.get(file_ext, {}).get('value', 400)
 
     if enable_enhanced_citations:
         args = {
@@ -4920,15 +4920,8 @@ def process_doc(document_id, user_id, temp_file_path, original_filename, enable_
         upload_to_blob(**args)
 
     try:
-        # Import docx2txt here to avoid dependency issues if not installed
-        try:
-            import docx2txt
-        except ImportError:
-            raise Exception("docx2txt library is required for .doc and .docm file processing. Install with: pip install docx2txt")
-
-        # Extract text from .doc or .docm file
         try:
-            text_content = docx2txt.process(temp_file_path)
+            text_content = extract_word_text(temp_file_path, f'.{file_ext}')
         except Exception as e:
             raise Exception(f"Error extracting text from {original_filename}: {e}")
 
@@ -4969,13 +4962,18 @@ def process_doc(document_id, user_id, temp_file_path, original_filename, enable_
                 elif is_group:
                     args["group_id"] = group_id
 
-                save_chunks(**args)
+                token_usage = save_chunks(**args)
                 total_chunks_saved += 1
 
+                if token_usage:
+                    total_embedding_tokens += token_usage.get('total_tokens', 0)
+                    if not embedding_model_name:
+                        embedding_model_name = token_usage.get('model_deployment_name')
+
     except Exception as e:
         raise Exception(f"Failed processing {original_filename}: {e}")
 
-    return total_chunks_saved
+    return total_chunks_saved, total_embedding_tokens, embedding_model_name
 
 def process_html(document_id, user_id, temp_file_path, original_filename, enable_enhanced_citations, update_callback, group_id=None, public_workspace_id=None):
     """Processes HTML files."""
@@ -5851,8 +5849,10 @@ def process_di_document(document_id, user_id, temp_file_path, original_filename,
     page_count = 0 # For PDF pre-check
 
     is_pdf = file_ext == '.pdf'
-    is_word = file_ext in ('.docx', '.doc')
+    is_word = file_ext in ('.docx', '.doc', '.docm')
+    is_legacy_doc = file_ext == '.doc'
     is_ppt = file_ext in ('.pptx', '.ppt')
+    is_legacy_ppt = file_ext == '.ppt'
     is_image = file_ext in tuple('.' + ext for ext in IMAGE_EXTENSIONS)
 
     try:
@@ -5861,9 +5861,11 @@ def process_di_document(document_id, user_id, temp_file_path, original_filename,
             doc_authors_list = parse_authors(doc_author)
             page_count = get_pdf_page_count(temp_file_path)
         elif is_word:
-            doc_title, doc_author = extract_docx_metadata(temp_file_path)
+            doc_title, doc_author = extract_word_metadata(temp_file_path, file_ext)
+            doc_authors_list = parse_authors(doc_author)
+        elif is_ppt:
+            doc_title, doc_author, doc_subject, doc_keywords = extract_presentation_metadata(temp_file_path, file_ext)
             doc_authors_list = parse_authors(doc_author)
-        # PPT and Image metadata extraction might be added here if needed/possible
 
         update_fields = {'status': "Extracted initial metadata"}
         if doc_title: update_fields['title'] = doc_title
@@ -5940,27 +5942,51 @@ def process_di_document(document_id, user_id, temp_file_path, original_filename,
 
             upload_to_blob(**args)
 
-        # Send chunk to Azure DI
-        update_callback(status=f"Sending {chunk_effective_filename} to Azure Document Intelligence...")
         di_extracted_pages = []
-        try:
-            di_extracted_pages = extract_content_with_azure_di(chunk_path)
-            num_di_pages = len(di_extracted_pages)
-            conceptual_pages = num_di_pages if not is_image else 1 # Image is one conceptual item
-
-            if not di_extracted_pages and not is_image:
-                print(f"Warning: Azure DI returned no content pages for {chunk_effective_filename}.")
-                status_msg = f"Azure DI found no content in {chunk_effective_filename}."
-                # Update page count to 0 if nothing found, otherwise keep previous estimate or conceptual count
-                update_callback(number_of_pages=0 if idx == num_file_chunks else conceptual_pages, status=status_msg)
-            elif not di_extracted_pages and is_image:
-                print(f"Info: Azure DI processed image {chunk_effective_filename}, but extracted no text.")
-                update_callback(number_of_pages=conceptual_pages, status=f"Processed image {chunk_effective_filename} (no text found).")
-            else:
-                 update_callback(number_of_pages=conceptual_pages, status=f"Received {num_di_pages} content page(s)/slide(s) from Azure DI for {chunk_effective_filename}.")
+        if is_legacy_doc:
+            update_callback(status=f"Extracting legacy Word content from {chunk_effective_filename}...")
+            try:
+                extracted_text = extract_word_text(chunk_path, file_ext)
+                if extracted_text and extracted_text.strip():
+                    di_extracted_pages = [{
+                        "page_number": 1,
+                        "content": extracted_text,
+                    }]
+                    update_callback(number_of_pages=1, status=f"Extracted legacy Word content from {chunk_effective_filename}.")
+                else:
+                    print(f"Warning: Legacy Word extractor returned no content for {chunk_effective_filename}.")
+                    update_callback(number_of_pages=0, status=f"Legacy Word extractor found no content in {chunk_effective_filename}.")
+            except Exception as e:
+                raise Exception(f"Error extracting content from {chunk_effective_filename} with the legacy Word extractor: {str(e)}")
+        elif is_legacy_ppt:
+            update_callback(status=f"Extracting legacy PowerPoint content from {chunk_effective_filename}...")
+            try:
+                di_extracted_pages = extract_legacy_ppt_pages(chunk_path)
+                total_slides = len(di_extracted_pages)
+                update_callback(number_of_pages=total_slides, status=f"Extracted legacy PowerPoint content from {chunk_effective_filename}.")
+            except Exception as e:
+                raise Exception(f"Error extracting content from {chunk_effective_filename} with the legacy PowerPoint extractor: {str(e)}")
+        else:
+            # Send chunk to Azure DI
+            update_callback(status=f"Sending {chunk_effective_filename} to Azure Document Intelligence...")
+            try:
+                di_extracted_pages = extract_content_with_azure_di(chunk_path)
+                num_di_pages = len(di_extracted_pages)
+                conceptual_pages = num_di_pages if not is_image else 1 # Image is one conceptual item
+
+                if not di_extracted_pages and not is_image:
+                    print(f"Warning: Azure DI returned no content pages for {chunk_effective_filename}.")
+                    status_msg = f"Azure DI found no content in {chunk_effective_filename}."
+                    # Update page count to 0 if nothing found, otherwise keep previous estimate or conceptual count
+                    update_callback(number_of_pages=0 if idx == num_file_chunks else conceptual_pages, status=status_msg)
+                elif not di_extracted_pages and is_image:
+                    print(f"Info: Azure DI processed image {chunk_effective_filename}, but extracted no text.")
+                    update_callback(number_of_pages=conceptual_pages, status=f"Processed image {chunk_effective_filename} (no text found).")
+                else:
+                     update_callback(number_of_pages=conceptual_pages, status=f"Received {num_di_pages} content page(s)/slide(s) from Azure DI for {chunk_effective_filename}.")
 
-        except Exception as e:
-            raise Exception(f"Error extracting content from {chunk_effective_filename} with Azure DI: {str(e)}")
+            except Exception as e:
+                raise Exception(f"Error extracting content from {chunk_effective_filename} with Azure DI: {str(e)}")
 
         # --- Multi-Modal Vision Analysis (for images only) - Must happen BEFORE save_chunks ---
         if is_image and enable_enhanced_citations and idx == 1:  # Only run once for first chunk
@@ -6553,7 +6579,7 @@ def update_doc_callback(**kwargs):
         update_doc_callback(status=f"Processing file {original_filename}, type: {file_ext}")
 
         # --- 1. Dispatch to appropriate handler based on file type ---
-        # Note: .doc and .docm are handled separately by process_doc() using docx2txt
+        # Note: .doc uses the shared document pipeline with OLE extraction, while .docm stays on the direct Word-text path.
 
         is_group = group_id is not None
 
@@ -6562,7 +6588,7 @@ def update_doc_callback(**kwargs):
             "user_id": user_id,
             "temp_file_path": temp_file_path,
             "original_filename": original_filename,
-            "file_ext": file_ext if file_ext in tabular_extensions or file_ext in di_supported_extensions else None,
+            "file_ext": file_ext if file_ext in tabular_extensions or file_ext in di_supported_extensions or file_ext == '.doc' else None,
             "enable_enhanced_citations": enable_enhanced_citations,
             "update_callback": update_doc_callback
         }
@@ -6597,7 +6623,7 @@ def update_doc_callback(**kwargs):
                 total_chunks_saved, total_embedding_tokens, embedding_model_name = result
             else:
                 total_chunks_saved = result
-        elif file_ext in ('.doc', '.docm'):
+        elif file_ext == '.docm':
             result = process_doc(**{k: v for k, v in args.items() if k != "file_ext"})
             if isinstance(result, tuple) and len(result) == 3:
                 total_chunks_saved, total_embedding_tokens, embedding_model_name = result
@@ -6647,7 +6673,7 @@ def update_doc_callback(**kwargs):
                 group_id=group_id,
                 public_workspace_id=public_workspace_id
             )
-        elif file_ext in di_supported_extensions:
+        elif file_ext in di_supported_extensions or file_ext == '.doc':
             result = process_di_document(**args)
             # Handle tuple return (chunks, tokens, model_name)
             if isinstance(result, tuple) and len(result) == 3:
@@ -6722,7 +6748,11 @@ def update_doc_callback(**kwargs):
                 embedding_tokens=total_embedding_tokens,
                 embedding_model=embedding_model_name,
                 version=doc_metadata.get('version') if doc_metadata else None,
-                author=doc_metadata.get('author') if doc_metadata else None,
+                author=(
+                    doc_metadata.get('author')
+                    or ', '.join(ensure_list(doc_metadata.get('authors')))
+                    or None
+                ) if doc_metadata else None,
                 title=doc_metadata.get('title') if doc_metadata else None,
                 subject=doc_metadata.get('subject') if doc_metadata else None,
                 publication_date=doc_metadata.get('publication_date') if doc_metadata else None,

diff --git a/application/single_app/requirements.txt b/application/single_app/requirements.txt
@@ -8,6 +8,7 @@ Werkzeug==3.1.6
 requests==2.33.0
 openai==1.109.1
 docx2txt==0.8
+olefile==0.47
 Markdown==3.8.1
 bleach==6.1.0
 azure-cosmos==4.9.0

diff --git a/application/single_app/route_backend_chats.py b/application/single_app/route_backend_chats.py
@@ -7771,7 +7771,10 @@ def result_requires_message_reload(result: Any) -> bool:
                     final_api_source_refs.insert(insert_idx, 'system:default_prompt')
                     default_system_prompt_inserted = True
 
-            if not original_hybrid_search_enabled:
+            if should_apply_history_grounding_message(
+                original_hybrid_search_enabled,
+                prior_grounded_document_refs,
+            ):
                 history_grounding_message = build_history_grounding_system_message()
                 insert_idx = 0
                 if (
@@ -10214,7 +10217,10 @@ def publish_live_plugin_thought(thought_payload):
                         final_api_source_refs.insert(insert_idx, 'system:default_prompt')
                         default_system_prompt_inserted = True
 
-                if not original_hybrid_search_enabled:
+                if should_apply_history_grounding_message(
+                    original_hybrid_search_enabled,
+                    prior_grounded_document_refs,
+                ):
                     history_grounding_message = build_history_grounding_system_message()
                     insert_idx = 0
                     if (
@@ -11632,6 +11638,14 @@ def build_history_grounding_system_message():
     }
 
 
+def should_apply_history_grounding_message(
+    original_hybrid_search_enabled,
+    prior_grounded_document_refs,
+):
+    """Return True only when original_hybrid_search_enabled is falsy and prior_grounded_document_refs is truthy."""
+    return (not bool(original_hybrid_search_enabled)) and bool(prior_grounded_document_refs)
+
+
 def build_assistant_history_content_with_citations(message, content):
     base_content = str(content or '').strip()
     citation_sections = []

diff --git a/application/single_app/route_frontend_chats.py b/application/single_app/route_frontend_chats.py
@@ -538,12 +538,11 @@ def upload_file():
                         # Continue without vision analysis
 
             elif file_ext_nodot in {'doc', 'docm'}:
-                # Use docx2txt for .doc and .docm files
+                # Use OLE parsing for legacy .doc files and docx2txt for .docm files
                 try:
-                    import docx2txt
-                    extracted_content = docx2txt.process(temp_file_path)
-                except ImportError:
-                    return jsonify({'error': 'docx2txt library required for .doc/.docm files'}), 500
+                    extracted_content = extract_word_text(temp_file_path, f'.{file_ext_nodot}')
+                except Exception as e:
+                    return jsonify({'error': f'Error extracting text from {filename}: {e}'}), 500
             elif file_ext_nodot == 'txt':
                 extracted_content  = extract_text_file(temp_file_path)
             elif file_ext_nodot == 'md':