diff --git a/application/single_app/functions_content.py b/application/single_app/functions_content.py index 4a9576e5..98eb0ea1 100644 --- a/application/single_app/functions_content.py +++ b/application/single_app/functions_content.py @@ -1,6 +1,11 @@ # functions_content.py import email.utils +import struct +import zipfile +from xml.etree import ElementTree + +import olefile from functions_debug import debug_print from config import * @@ -15,6 +20,193 @@ def extract_markdown_file(file_path): with open(file_path, 'r', encoding='utf-8') as f: return f.read() + +def extract_docx_text(file_path): + """Extract text from OOXML Word documents such as .docx and .docm.""" + try: + import docx2txt + except ImportError as exc: + raise Exception( + "docx2txt library is required for .docx/.docm file processing. Install with: pip install docx2txt" + ) from exc + + return docx2txt.process(file_path) + + +def _normalize_legacy_doc_text(text): + """Convert Word control characters into readable plain text.""" + if not text: + return "" + + field_stripped_text = [] + field_stack = [] + + for character in text: + if character == "\x13": + field_stack.append("code") + continue + if character == "\x14": + if field_stack: + field_stack[-1] = "result" + continue + if character == "\x15": + if field_stack: + field_stack.pop() + continue + + if not field_stack or field_stack[-1] == "result": + field_stripped_text.append(character) + + normalized_text = ( + "".join(field_stripped_text) + .replace("\r", "\n") + .replace("\x0b", "\n") + .replace("\x0c", "\n\n") + .replace("\x07", "\t") + .replace("\x00", "") + ) + normalized_text = re.sub(r"[\x01-\x08\x0e-\x1f]", " ", normalized_text) + normalized_text = re.sub(r"\n{3,}", "\n\n", normalized_text) + normalized_text = re.sub(r"[ \t]{2,}", " ", normalized_text) + return normalized_text.strip() + + +def _score_legacy_doc_candidate(text): + """Prefer longer candidates with a high ratio of readable characters.""" + if not text: + return 0 + + readable_characters = sum( + 1 + for character in text + if character.isalnum() + or character.isspace() + or character in ".,;:!?()[]{}'\"-_/@#$%^&*+=<>|" + ) + return readable_characters + + +def _extract_legacy_doc_text_from_piece_table(word_stream, piece_table_bytes): + """Parse a PlcPcd piece table from the WordDocument stream.""" + if len(piece_table_bytes) < 16 or (len(piece_table_bytes) - 4) % 12 != 0: + return "" + + piece_count = (len(piece_table_bytes) - 4) // 12 + cp_count = piece_count + 1 + cp_byte_count = cp_count * 4 + + if len(piece_table_bytes) != cp_byte_count + (piece_count * 8): + return "" + + character_positions = struct.unpack(f"<{cp_count}I", piece_table_bytes[:cp_byte_count]) + if any(character_positions[index] > character_positions[index + 1] for index in range(piece_count)): + return "" + + text_segments = [] + piece_descriptor_offset = cp_byte_count + + for index in range(piece_count): + start_cp = character_positions[index] + end_cp = character_positions[index + 1] + character_count = end_cp - start_cp + if character_count < 0: + return "" + + piece_descriptor_start = piece_descriptor_offset + (index * 8) + piece_descriptor_end = piece_descriptor_start + 8 + piece_descriptor = piece_table_bytes[piece_descriptor_start:piece_descriptor_end] + if len(piece_descriptor) != 8: + return "" + + fc_compressed = struct.unpack(" len(word_stream): + return "" + + raw_text = word_stream[stream_offset:stream_offset + byte_count] + text_segments.append(raw_text.decode(encoding, errors='ignore')) + + return _normalize_legacy_doc_text("".join(text_segments)) + + +def _extract_legacy_doc_text_from_table_stream(word_stream, table_stream): + """Scan a Word table stream for the most plausible text piece table.""" + best_text = "" + best_score = 0 + search_offset = 0 + + while search_offset <= len(table_stream) - 5: + piece_table_marker_offset = table_stream.find(b"\x02", search_offset) + if piece_table_marker_offset == -1 or piece_table_marker_offset > len(table_stream) - 5: + break + + piece_table_length = struct.unpack( + "= 16 + and (piece_table_length - 4) % 12 == 0 + and piece_table_end <= len(table_stream) + ): + candidate_text = _extract_legacy_doc_text_from_piece_table( + word_stream, + table_stream[piece_table_marker_offset + 5:piece_table_end], + ) + candidate_score = _score_legacy_doc_candidate(candidate_text) + if candidate_score > best_score: + best_text = candidate_text + best_score = candidate_score + + search_offset = piece_table_marker_offset + 1 + + return best_text + + +def extract_legacy_doc_text(file_path): + """Extract text from Word 97-2003 .doc files using OLE streams and piece tables.""" + if not olefile.isOleFile(file_path): + raise Exception("File is not a valid OLE compound document") + + ole = olefile.OleFileIO(file_path) + try: + if not ole.exists("WordDocument"): + raise Exception("Missing WordDocument stream") + + word_stream = ole.openstream("WordDocument").read() + best_text = "" + best_score = 0 + + for table_stream_name in ("1Table", "0Table"): + if not ole.exists(table_stream_name): + continue + + table_stream = ole.openstream(table_stream_name).read() + candidate_text = _extract_legacy_doc_text_from_table_stream(word_stream, table_stream) + candidate_score = _score_legacy_doc_candidate(candidate_text) + if candidate_score > best_score: + best_text = candidate_text + best_score = candidate_score + + if not best_text: + raise Exception("Could not locate a readable text piece table in the document") + + return best_text + finally: + ole.close() + def extract_content_with_azure_di(file_path): """ Extracts text page-by-page using Azure Document Intelligence "prebuilt-read" @@ -218,6 +410,245 @@ def extract_docx_metadata(docx_path): print(f"Error extracting DOCX metadata: {e}") return '', '' + +def _normalize_legacy_doc_metadata_value(value): + """Convert OLE metadata values into trimmed strings.""" + if value is None: + return '' + + if isinstance(value, bytes): + for encoding in ('utf-8', 'utf-16le', 'cp1252', 'latin1'): + try: + value = value.decode(encoding) + break + except Exception: + continue + else: + value = value.decode('utf-8', errors='ignore') + + return str(value).strip().strip('\x00').strip() + + +def _parse_metadata_keywords(value): + """Parse metadata keywords into a normalized list of values.""" + normalized_value = _normalize_legacy_doc_metadata_value(value) + if not normalized_value: + return [] + + return [keyword.strip() for keyword in re.split(r'[;,]', normalized_value) if keyword.strip()] + + +def extract_legacy_doc_metadata(doc_path): + """Return title and author from a legacy OLE Word document when available.""" + try: + if not olefile.isOleFile(doc_path): + return '', '' + + ole = olefile.OleFileIO(doc_path) + try: + metadata = ole.get_metadata() + doc_title = _normalize_legacy_doc_metadata_value(getattr(metadata, 'title', '')) + doc_author = _normalize_legacy_doc_metadata_value(getattr(metadata, 'author', '')) + + if not doc_author: + doc_author = _normalize_legacy_doc_metadata_value(getattr(metadata, 'last_saved_by', '')) + + return doc_title, doc_author + finally: + ole.close() + except Exception as e: + print(f"Error extracting DOC metadata: {e}") + return '', '' + + +def extract_pptx_metadata(pptx_path): + """Return title, author, subject, and keywords from an OOXML PowerPoint file.""" + namespaces = { + 'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties', + 'dc': 'http://purl.org/dc/elements/1.1/', + } + + try: + with zipfile.ZipFile(pptx_path) as archive: + try: + core_properties = archive.read('docProps/core.xml') + except KeyError: + return '', '', '', [] + + root = ElementTree.fromstring(core_properties) + ppt_title = (root.findtext('dc:title', default='', namespaces=namespaces) or '').strip() + ppt_author = (root.findtext('dc:creator', default='', namespaces=namespaces) or '').strip() + ppt_subject = (root.findtext('dc:subject', default='', namespaces=namespaces) or '').strip() + ppt_keywords = _parse_metadata_keywords( + root.findtext('cp:keywords', default='', namespaces=namespaces) or '' + ) + return ppt_title, ppt_author, ppt_subject, ppt_keywords + except Exception as e: + print(f"Error extracting PPTX metadata: {e}") + return '', '', '', [] + + +def _clean_legacy_ppt_text_fragment(text): + """Normalize legacy PowerPoint text atoms into readable slide text.""" + if not text: + return '' + + normalized_text = ( + text + .replace('\r', '\n') + .replace('\x0b', '\n') + .replace('\x0c', '\n') + .replace('\x00', '') + ) + normalized_text = re.sub(r'[\x01-\x08\x0e-\x1f]', ' ', normalized_text) + normalized_text = re.sub(r'[ \t]{2,}', ' ', normalized_text) + normalized_text = re.sub(r'\n{3,}', '\n\n', normalized_text) + return normalized_text.strip() + + +def extract_legacy_ppt_pages(file_path): + """Extract slide text from a legacy OLE PowerPoint .ppt file.""" + if not olefile.isOleFile(file_path): + raise Exception("File is not a valid OLE compound document") + + ole = olefile.OleFileIO(file_path) + try: + if not ole.exists('PowerPoint Document'): + raise Exception("Missing PowerPoint Document stream") + + document_stream = ole.openstream('PowerPoint Document').read() + finally: + ole.close() + + slide_fragments = {} + slide_counter = 0 + + def walk_records(start_offset, end_offset, current_slide_number=None): + nonlocal slide_counter + + offset = start_offset + while offset + 8 <= end_offset: + record_header = struct.unpack_from(' end_offset: + return + + next_slide_number = current_slide_number + if record_type == 1006: + slide_counter += 1 + next_slide_number = slide_counter + slide_fragments.setdefault(next_slide_number, []) + + if record_type in {4000, 4008} and next_slide_number is not None: + if record_type == 4000: + raw_text = document_stream[payload_start:payload_end].decode('utf-16le', errors='ignore') + else: + raw_text = document_stream[payload_start:payload_end].decode('cp1252', errors='ignore') + + cleaned_text = _clean_legacy_ppt_text_fragment(raw_text) + if cleaned_text: + fragments = slide_fragments.setdefault(next_slide_number, []) + if not fragments or fragments[-1] != cleaned_text: + fragments.append(cleaned_text) + + if record_version == 0x0F: + walk_records(payload_start, payload_end, next_slide_number) + + offset = payload_end + + walk_records(0, len(document_stream)) + + pages = [] + non_empty_slide_count = 0 + for slide_number in range(1, slide_counter + 1): + slide_text = "\n".join(slide_fragments.get(slide_number, [])) + slide_text = re.sub(r'\n{3,}', '\n\n', slide_text).strip() + if slide_text: + non_empty_slide_count += 1 + + pages.append({ + 'page_number': slide_number, + 'content': slide_text, + }) + + if non_empty_slide_count == 0: + raise Exception("Could not locate readable slide text in the presentation") + + return pages + + +def extract_legacy_ppt_metadata(ppt_path): + """Return title, author, subject, and keywords from a legacy OLE PowerPoint file.""" + try: + if not olefile.isOleFile(ppt_path): + return '', '', '', [] + + ole = olefile.OleFileIO(ppt_path) + try: + metadata = ole.get_metadata() + ppt_title = _normalize_legacy_doc_metadata_value(getattr(metadata, 'title', '')) + ppt_author = _normalize_legacy_doc_metadata_value(getattr(metadata, 'author', '')) + ppt_subject = _normalize_legacy_doc_metadata_value(getattr(metadata, 'subject', '')) + ppt_keywords = _parse_metadata_keywords(getattr(metadata, 'keywords', '')) + + if not ppt_author: + ppt_author = _normalize_legacy_doc_metadata_value(getattr(metadata, 'last_saved_by', '')) + + return ppt_title, ppt_author, ppt_subject, ppt_keywords + finally: + ole.close() + except Exception as e: + print(f"Error extracting PPT metadata: {e}") + return '', '', '', [] + + +def extract_presentation_metadata(file_path, file_extension=None): + """Extract metadata from supported PowerPoint presentation formats.""" + resolved_extension = (file_extension or os.path.splitext(file_path)[1]).lower() + + if resolved_extension == '.ppt': + return extract_legacy_ppt_metadata(file_path) + + if resolved_extension == '.pptx': + return extract_pptx_metadata(file_path) + + return '', '', '', [] + + +def extract_word_text(file_path, file_extension=None): + """Extract text from supported Word document formats.""" + resolved_extension = (file_extension or os.path.splitext(file_path)[1]).lower() + + if resolved_extension == '.doc': + if olefile.isOleFile(file_path): + return extract_legacy_doc_text(file_path) + return extract_docx_text(file_path) + + if resolved_extension in {'.docx', '.docm'}: + return extract_docx_text(file_path) + + raise ValueError(f"Unsupported Word document extension: {resolved_extension}") + + +def extract_word_metadata(file_path, file_extension=None): + """Extract title and author metadata from supported Word document formats.""" + resolved_extension = (file_extension or os.path.splitext(file_path)[1]).lower() + + if resolved_extension == '.doc': + if olefile.isOleFile(file_path): + return extract_legacy_doc_metadata(file_path) + return extract_docx_metadata(file_path) + + if resolved_extension in {'.docx', '.docm'}: + return extract_docx_metadata(file_path) + + return '', '' + def parse_authors(author_input): """ Converts any input (None, string, list, comma-delimited, etc.) diff --git a/application/single_app/functions_documents.py b/application/single_app/functions_documents.py index 2ff2fc95..7bff48d8 100644 --- a/application/single_app/functions_documents.py +++ b/application/single_app/functions_documents.py @@ -4535,7 +4535,7 @@ def process_log(document_id, user_id, temp_file_path, original_filename, enable_ def process_doc(document_id, user_id, temp_file_path, original_filename, enable_enhanced_citations, update_callback, group_id=None, public_workspace_id=None): """ - Processes .doc and .docm files using docx2txt library. + Processes legacy .doc files via OLE piece tables and .docm files via docx2txt. Note: .docx files still use Document Intelligence for better formatting preservation. """ is_group = group_id is not None @@ -4543,8 +4543,11 @@ def process_doc(document_id, user_id, temp_file_path, original_filename, enable_ update_callback(status=f"Processing {original_filename.split('.')[-1].upper()} file...") total_chunks_saved = 0 + total_embedding_tokens = 0 + embedding_model_name = None chunk_config = get_chunk_size_config(get_settings()) - target_words_per_chunk = chunk_config.get('doc', {}).get('value', 400) # Consistent with other text-based chunking + file_ext = os.path.splitext(original_filename)[1].lower().lstrip('.') + target_words_per_chunk = chunk_config.get(file_ext, {}).get('value', 400) if enable_enhanced_citations: args = { @@ -4563,15 +4566,8 @@ def process_doc(document_id, user_id, temp_file_path, original_filename, enable_ upload_to_blob(**args) try: - # Import docx2txt here to avoid dependency issues if not installed - try: - import docx2txt - except ImportError: - raise Exception("docx2txt library is required for .doc and .docm file processing. Install with: pip install docx2txt") - - # Extract text from .doc or .docm file try: - text_content = docx2txt.process(temp_file_path) + text_content = extract_word_text(temp_file_path, f'.{file_ext}') except Exception as e: raise Exception(f"Error extracting text from {original_filename}: {e}") @@ -4893,7 +4889,7 @@ def process_log(document_id, user_id, temp_file_path, original_filename, enable_ def process_doc(document_id, user_id, temp_file_path, original_filename, enable_enhanced_citations, update_callback, group_id=None, public_workspace_id=None): """ - Processes .doc and .docm files using docx2txt library. + Processes legacy .doc files via OLE piece tables and .docm files via docx2txt. Note: .docx files still use Document Intelligence for better formatting preservation. """ is_group = group_id is not None @@ -4901,7 +4897,11 @@ def process_doc(document_id, user_id, temp_file_path, original_filename, enable_ update_callback(status=f"Processing {original_filename.split('.')[-1].upper()} file...") total_chunks_saved = 0 - target_words_per_chunk = 400 # Consistent with other text-based chunking + total_embedding_tokens = 0 + embedding_model_name = None + chunk_config = get_chunk_size_config(get_settings()) + file_ext = os.path.splitext(original_filename)[1].lower().lstrip('.') + target_words_per_chunk = chunk_config.get(file_ext, {}).get('value', 400) if enable_enhanced_citations: args = { @@ -4920,15 +4920,8 @@ def process_doc(document_id, user_id, temp_file_path, original_filename, enable_ upload_to_blob(**args) try: - # Import docx2txt here to avoid dependency issues if not installed - try: - import docx2txt - except ImportError: - raise Exception("docx2txt library is required for .doc and .docm file processing. Install with: pip install docx2txt") - - # Extract text from .doc or .docm file try: - text_content = docx2txt.process(temp_file_path) + text_content = extract_word_text(temp_file_path, f'.{file_ext}') except Exception as e: raise Exception(f"Error extracting text from {original_filename}: {e}") @@ -4969,13 +4962,18 @@ def process_doc(document_id, user_id, temp_file_path, original_filename, enable_ elif is_group: args["group_id"] = group_id - save_chunks(**args) + token_usage = save_chunks(**args) total_chunks_saved += 1 + if token_usage: + total_embedding_tokens += token_usage.get('total_tokens', 0) + if not embedding_model_name: + embedding_model_name = token_usage.get('model_deployment_name') + except Exception as e: raise Exception(f"Failed processing {original_filename}: {e}") - return total_chunks_saved + return total_chunks_saved, total_embedding_tokens, embedding_model_name def process_html(document_id, user_id, temp_file_path, original_filename, enable_enhanced_citations, update_callback, group_id=None, public_workspace_id=None): """Processes HTML files.""" @@ -5851,8 +5849,10 @@ def process_di_document(document_id, user_id, temp_file_path, original_filename, page_count = 0 # For PDF pre-check is_pdf = file_ext == '.pdf' - is_word = file_ext in ('.docx', '.doc') + is_word = file_ext in ('.docx', '.doc', '.docm') + is_legacy_doc = file_ext == '.doc' is_ppt = file_ext in ('.pptx', '.ppt') + is_legacy_ppt = file_ext == '.ppt' is_image = file_ext in tuple('.' + ext for ext in IMAGE_EXTENSIONS) try: @@ -5861,9 +5861,11 @@ def process_di_document(document_id, user_id, temp_file_path, original_filename, doc_authors_list = parse_authors(doc_author) page_count = get_pdf_page_count(temp_file_path) elif is_word: - doc_title, doc_author = extract_docx_metadata(temp_file_path) + doc_title, doc_author = extract_word_metadata(temp_file_path, file_ext) + doc_authors_list = parse_authors(doc_author) + elif is_ppt: + doc_title, doc_author, doc_subject, doc_keywords = extract_presentation_metadata(temp_file_path, file_ext) doc_authors_list = parse_authors(doc_author) - # PPT and Image metadata extraction might be added here if needed/possible update_fields = {'status': "Extracted initial metadata"} if doc_title: update_fields['title'] = doc_title @@ -5940,27 +5942,51 @@ def process_di_document(document_id, user_id, temp_file_path, original_filename, upload_to_blob(**args) - # Send chunk to Azure DI - update_callback(status=f"Sending {chunk_effective_filename} to Azure Document Intelligence...") di_extracted_pages = [] - try: - di_extracted_pages = extract_content_with_azure_di(chunk_path) - num_di_pages = len(di_extracted_pages) - conceptual_pages = num_di_pages if not is_image else 1 # Image is one conceptual item - - if not di_extracted_pages and not is_image: - print(f"Warning: Azure DI returned no content pages for {chunk_effective_filename}.") - status_msg = f"Azure DI found no content in {chunk_effective_filename}." - # Update page count to 0 if nothing found, otherwise keep previous estimate or conceptual count - update_callback(number_of_pages=0 if idx == num_file_chunks else conceptual_pages, status=status_msg) - elif not di_extracted_pages and is_image: - print(f"Info: Azure DI processed image {chunk_effective_filename}, but extracted no text.") - update_callback(number_of_pages=conceptual_pages, status=f"Processed image {chunk_effective_filename} (no text found).") - else: - update_callback(number_of_pages=conceptual_pages, status=f"Received {num_di_pages} content page(s)/slide(s) from Azure DI for {chunk_effective_filename}.") + if is_legacy_doc: + update_callback(status=f"Extracting legacy Word content from {chunk_effective_filename}...") + try: + extracted_text = extract_word_text(chunk_path, file_ext) + if extracted_text and extracted_text.strip(): + di_extracted_pages = [{ + "page_number": 1, + "content": extracted_text, + }] + update_callback(number_of_pages=1, status=f"Extracted legacy Word content from {chunk_effective_filename}.") + else: + print(f"Warning: Legacy Word extractor returned no content for {chunk_effective_filename}.") + update_callback(number_of_pages=0, status=f"Legacy Word extractor found no content in {chunk_effective_filename}.") + except Exception as e: + raise Exception(f"Error extracting content from {chunk_effective_filename} with the legacy Word extractor: {str(e)}") + elif is_legacy_ppt: + update_callback(status=f"Extracting legacy PowerPoint content from {chunk_effective_filename}...") + try: + di_extracted_pages = extract_legacy_ppt_pages(chunk_path) + total_slides = len(di_extracted_pages) + update_callback(number_of_pages=total_slides, status=f"Extracted legacy PowerPoint content from {chunk_effective_filename}.") + except Exception as e: + raise Exception(f"Error extracting content from {chunk_effective_filename} with the legacy PowerPoint extractor: {str(e)}") + else: + # Send chunk to Azure DI + update_callback(status=f"Sending {chunk_effective_filename} to Azure Document Intelligence...") + try: + di_extracted_pages = extract_content_with_azure_di(chunk_path) + num_di_pages = len(di_extracted_pages) + conceptual_pages = num_di_pages if not is_image else 1 # Image is one conceptual item + + if not di_extracted_pages and not is_image: + print(f"Warning: Azure DI returned no content pages for {chunk_effective_filename}.") + status_msg = f"Azure DI found no content in {chunk_effective_filename}." + # Update page count to 0 if nothing found, otherwise keep previous estimate or conceptual count + update_callback(number_of_pages=0 if idx == num_file_chunks else conceptual_pages, status=status_msg) + elif not di_extracted_pages and is_image: + print(f"Info: Azure DI processed image {chunk_effective_filename}, but extracted no text.") + update_callback(number_of_pages=conceptual_pages, status=f"Processed image {chunk_effective_filename} (no text found).") + else: + update_callback(number_of_pages=conceptual_pages, status=f"Received {num_di_pages} content page(s)/slide(s) from Azure DI for {chunk_effective_filename}.") - except Exception as e: - raise Exception(f"Error extracting content from {chunk_effective_filename} with Azure DI: {str(e)}") + except Exception as e: + raise Exception(f"Error extracting content from {chunk_effective_filename} with Azure DI: {str(e)}") # --- Multi-Modal Vision Analysis (for images only) - Must happen BEFORE save_chunks --- if is_image and enable_enhanced_citations and idx == 1: # Only run once for first chunk @@ -6553,7 +6579,7 @@ def update_doc_callback(**kwargs): update_doc_callback(status=f"Processing file {original_filename}, type: {file_ext}") # --- 1. Dispatch to appropriate handler based on file type --- - # Note: .doc and .docm are handled separately by process_doc() using docx2txt + # Note: .doc uses the shared document pipeline with OLE extraction, while .docm stays on the direct Word-text path. is_group = group_id is not None @@ -6562,7 +6588,7 @@ def update_doc_callback(**kwargs): "user_id": user_id, "temp_file_path": temp_file_path, "original_filename": original_filename, - "file_ext": file_ext if file_ext in tabular_extensions or file_ext in di_supported_extensions else None, + "file_ext": file_ext if file_ext in tabular_extensions or file_ext in di_supported_extensions or file_ext == '.doc' else None, "enable_enhanced_citations": enable_enhanced_citations, "update_callback": update_doc_callback } @@ -6597,7 +6623,7 @@ def update_doc_callback(**kwargs): total_chunks_saved, total_embedding_tokens, embedding_model_name = result else: total_chunks_saved = result - elif file_ext in ('.doc', '.docm'): + elif file_ext == '.docm': result = process_doc(**{k: v for k, v in args.items() if k != "file_ext"}) if isinstance(result, tuple) and len(result) == 3: total_chunks_saved, total_embedding_tokens, embedding_model_name = result @@ -6647,7 +6673,7 @@ def update_doc_callback(**kwargs): group_id=group_id, public_workspace_id=public_workspace_id ) - elif file_ext in di_supported_extensions: + elif file_ext in di_supported_extensions or file_ext == '.doc': result = process_di_document(**args) # Handle tuple return (chunks, tokens, model_name) if isinstance(result, tuple) and len(result) == 3: @@ -6722,7 +6748,11 @@ def update_doc_callback(**kwargs): embedding_tokens=total_embedding_tokens, embedding_model=embedding_model_name, version=doc_metadata.get('version') if doc_metadata else None, - author=doc_metadata.get('author') if doc_metadata else None, + author=( + doc_metadata.get('author') + or ', '.join(ensure_list(doc_metadata.get('authors'))) + or None + ) if doc_metadata else None, title=doc_metadata.get('title') if doc_metadata else None, subject=doc_metadata.get('subject') if doc_metadata else None, publication_date=doc_metadata.get('publication_date') if doc_metadata else None, diff --git a/application/single_app/requirements.txt b/application/single_app/requirements.txt index a01ab60c..48aa0877 100644 --- a/application/single_app/requirements.txt +++ b/application/single_app/requirements.txt @@ -8,6 +8,7 @@ Werkzeug==3.1.6 requests==2.33.0 openai==1.109.1 docx2txt==0.8 +olefile==0.47 Markdown==3.8.1 bleach==6.1.0 azure-cosmos==4.9.0 diff --git a/application/single_app/route_backend_chats.py b/application/single_app/route_backend_chats.py index 8bd44f01..db6ff619 100644 --- a/application/single_app/route_backend_chats.py +++ b/application/single_app/route_backend_chats.py @@ -7771,7 +7771,10 @@ def result_requires_message_reload(result: Any) -> bool: final_api_source_refs.insert(insert_idx, 'system:default_prompt') default_system_prompt_inserted = True - if not original_hybrid_search_enabled: + if should_apply_history_grounding_message( + original_hybrid_search_enabled, + prior_grounded_document_refs, + ): history_grounding_message = build_history_grounding_system_message() insert_idx = 0 if ( @@ -10214,7 +10217,10 @@ def publish_live_plugin_thought(thought_payload): final_api_source_refs.insert(insert_idx, 'system:default_prompt') default_system_prompt_inserted = True - if not original_hybrid_search_enabled: + if should_apply_history_grounding_message( + original_hybrid_search_enabled, + prior_grounded_document_refs, + ): history_grounding_message = build_history_grounding_system_message() insert_idx = 0 if ( @@ -11632,6 +11638,14 @@ def build_history_grounding_system_message(): } +def should_apply_history_grounding_message( + original_hybrid_search_enabled, + prior_grounded_document_refs, +): + """Apply bounded grounding only when prior grounded docs exist for this conversation.""" + return (not bool(original_hybrid_search_enabled)) and bool(prior_grounded_document_refs) + + def build_assistant_history_content_with_citations(message, content): base_content = str(content or '').strip() citation_sections = [] diff --git a/application/single_app/route_frontend_chats.py b/application/single_app/route_frontend_chats.py index c5590c4b..2679f57b 100644 --- a/application/single_app/route_frontend_chats.py +++ b/application/single_app/route_frontend_chats.py @@ -538,12 +538,11 @@ def upload_file(): # Continue without vision analysis elif file_ext_nodot in {'doc', 'docm'}: - # Use docx2txt for .doc and .docm files + # Use OLE parsing for legacy .doc files and docx2txt for .docm files try: - import docx2txt - extracted_content = docx2txt.process(temp_file_path) - except ImportError: - return jsonify({'error': 'docx2txt library required for .doc/.docm files'}), 500 + extracted_content = extract_word_text(temp_file_path, f'.{file_ext_nodot}') + except Exception as e: + return jsonify({'error': f'Error extracting text from {filename}: {e}'}), 500 elif file_ext_nodot == 'txt': extracted_content = extract_text_file(temp_file_path) elif file_ext_nodot == 'md': diff --git a/application/single_app/templates/profile.html b/application/single_app/templates/profile.html index 685315e7..5e6dd0a9 100644 --- a/application/single_app/templates/profile.html +++ b/application/single_app/templates/profile.html @@ -135,6 +135,52 @@ animation: spin 1s linear infinite; } + .preference-card-icon { + align-items: center; + background: linear-gradient(135deg, rgba(13, 110, 253, 0.12), rgba(13, 202, 240, 0.18)); + border-radius: 14px; + color: var(--bs-primary); + display: inline-flex; + font-size: 1.25rem; + height: 52px; + justify-content: center; + width: 52px; + } + + .preference-status { + min-height: 1.5rem; + } + + .fact-memory-summary-card { + background: linear-gradient(135deg, rgba(13, 110, 253, 0.08), rgba(13, 202, 240, 0.12)); + border: 1px solid rgba(13, 110, 253, 0.14); + border-radius: 16px; + padding: 1.25rem; + } + + .fact-memory-count { + font-size: 2rem; + font-weight: 700; + line-height: 1; + } + + .fact-memory-modal-list { + max-height: 55vh; + overflow-y: auto; + } + + .fact-memory-modal-card { + border: 1px solid var(--bs-border-color); + border-radius: 14px; + padding: 1rem; + background: var(--bs-body-bg); + } + + .fact-memory-pagination-summary { + min-height: 1.5rem; + } + + .preference-card-icon { align-items: center; background: linear-gradient(135deg, rgba(13, 110, 253, 0.12), rgba(13, 202, 240, 0.18)); @@ -452,6 +498,117 @@
Fact Memory
+
+
+
+ + + +
+
Tutorial Preferences
+

Control whether the floating guided tutorial buttons appear on Chat and Personal Workspace for your account.

+

These launchers are shown by default. You can hide them now and turn them back on later from this page.

+
+
+ {% if app_settings.enable_support_menu and app_settings.enable_support_latest_features and app_settings.support_latest_features_has_visible_items %} + + View Latest Feature Notes + + {% endif %} +
+ +
+
+
+ + + Turn this off if you already know the interface and want to remove the floating walkthrough launchers. +
+
+
+ +
+
+ +
+
+ +
+
+
+ + + +
+
Fact Memory
+

Save durable facts for your account, then manage the full memory list from a compact popup editor.

+ {% if app_settings.enable_fact_memory_plugin %} + Enabled by admin + Supported chat and mini-SK flows can use these memories when fact memory is enabled. + {% else %} + Disabled by admin + You can still manage saved memories here. They will stay inactive until an administrator turns fact memory back on. + {% endif %} +
+
+
+ +
+
+ + +
+
+ + +
+
+ Use instruction memories for durable preferences like tone or formatting. Use fact memories for details about you that the model should recall only when relevant. + +
+
+
+
+
Saved Memories
+
0
+
No memories saved yet.
+
0 instructions, 0 facts
+
+
+ + +
+ Use the popup editor for search, paging, and updates. +
+
+
+ +
+
+ {% if app_settings.enable_retention_policy_personal %}
@@ -804,6 +961,88 @@
+ + + +