Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
431 changes: 431 additions & 0 deletions application/single_app/functions_content.py

Large diffs are not rendered by default.

128 changes: 79 additions & 49 deletions application/single_app/functions_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -4535,16 +4535,19 @@ def process_log(document_id, user_id, temp_file_path, original_filename, enable_

def process_doc(document_id, user_id, temp_file_path, original_filename, enable_enhanced_citations, update_callback, group_id=None, public_workspace_id=None):
"""
Processes .doc and .docm files using docx2txt library.
Processes legacy .doc files via OLE piece tables and .docm files via docx2txt.
Note: .docx files still use Document Intelligence for better formatting preservation.
"""
is_group = group_id is not None
is_public_workspace = public_workspace_id is not None

update_callback(status=f"Processing {original_filename.split('.')[-1].upper()} file...")
total_chunks_saved = 0
total_embedding_tokens = 0
embedding_model_name = None
chunk_config = get_chunk_size_config(get_settings())
target_words_per_chunk = chunk_config.get('doc', {}).get('value', 400) # Consistent with other text-based chunking
file_ext = os.path.splitext(original_filename)[1].lower().lstrip('.')
target_words_per_chunk = chunk_config.get(file_ext, {}).get('value', 400)

if enable_enhanced_citations:
args = {
Expand All @@ -4563,15 +4566,8 @@ def process_doc(document_id, user_id, temp_file_path, original_filename, enable_
upload_to_blob(**args)

try:
# Import docx2txt here to avoid dependency issues if not installed
try:
import docx2txt
except ImportError:
raise Exception("docx2txt library is required for .doc and .docm file processing. Install with: pip install docx2txt")

# Extract text from .doc or .docm file
try:
text_content = docx2txt.process(temp_file_path)
text_content = extract_word_text(temp_file_path, f'.{file_ext}')
except Exception as e:
raise Exception(f"Error extracting text from {original_filename}: {e}")

Expand Down Expand Up @@ -4893,15 +4889,19 @@ def process_log(document_id, user_id, temp_file_path, original_filename, enable_

def process_doc(document_id, user_id, temp_file_path, original_filename, enable_enhanced_citations, update_callback, group_id=None, public_workspace_id=None):
"""
Processes .doc and .docm files using docx2txt library.
Processes legacy .doc files via OLE piece tables and .docm files via docx2txt.
Note: .docx files still use Document Intelligence for better formatting preservation.
"""
is_group = group_id is not None
is_public_workspace = public_workspace_id is not None

update_callback(status=f"Processing {original_filename.split('.')[-1].upper()} file...")
total_chunks_saved = 0
target_words_per_chunk = 400 # Consistent with other text-based chunking
total_embedding_tokens = 0
embedding_model_name = None
chunk_config = get_chunk_size_config(get_settings())
file_ext = os.path.splitext(original_filename)[1].lower().lstrip('.')
target_words_per_chunk = chunk_config.get(file_ext, {}).get('value', 400)

if enable_enhanced_citations:
args = {
Expand All @@ -4920,15 +4920,8 @@ def process_doc(document_id, user_id, temp_file_path, original_filename, enable_
upload_to_blob(**args)

try:
# Import docx2txt here to avoid dependency issues if not installed
try:
import docx2txt
except ImportError:
raise Exception("docx2txt library is required for .doc and .docm file processing. Install with: pip install docx2txt")

# Extract text from .doc or .docm file
try:
text_content = docx2txt.process(temp_file_path)
text_content = extract_word_text(temp_file_path, f'.{file_ext}')
except Exception as e:
raise Exception(f"Error extracting text from {original_filename}: {e}")

Expand Down Expand Up @@ -4969,13 +4962,18 @@ def process_doc(document_id, user_id, temp_file_path, original_filename, enable_
elif is_group:
args["group_id"] = group_id

save_chunks(**args)
token_usage = save_chunks(**args)
total_chunks_saved += 1

if token_usage:
total_embedding_tokens += token_usage.get('total_tokens', 0)
if not embedding_model_name:
embedding_model_name = token_usage.get('model_deployment_name')

except Exception as e:
raise Exception(f"Failed processing {original_filename}: {e}")

return total_chunks_saved
return total_chunks_saved, total_embedding_tokens, embedding_model_name

def process_html(document_id, user_id, temp_file_path, original_filename, enable_enhanced_citations, update_callback, group_id=None, public_workspace_id=None):
"""Processes HTML files."""
Expand Down Expand Up @@ -5851,8 +5849,10 @@ def process_di_document(document_id, user_id, temp_file_path, original_filename,
page_count = 0 # For PDF pre-check

is_pdf = file_ext == '.pdf'
is_word = file_ext in ('.docx', '.doc')
is_word = file_ext in ('.docx', '.doc', '.docm')
is_legacy_doc = file_ext == '.doc'
is_ppt = file_ext in ('.pptx', '.ppt')
is_legacy_ppt = file_ext == '.ppt'
is_image = file_ext in tuple('.' + ext for ext in IMAGE_EXTENSIONS)

try:
Expand All @@ -5861,9 +5861,11 @@ def process_di_document(document_id, user_id, temp_file_path, original_filename,
doc_authors_list = parse_authors(doc_author)
page_count = get_pdf_page_count(temp_file_path)
elif is_word:
doc_title, doc_author = extract_docx_metadata(temp_file_path)
doc_title, doc_author = extract_word_metadata(temp_file_path, file_ext)
doc_authors_list = parse_authors(doc_author)
elif is_ppt:
doc_title, doc_author, doc_subject, doc_keywords = extract_presentation_metadata(temp_file_path, file_ext)
doc_authors_list = parse_authors(doc_author)
# Image metadata extraction might be added here if needed/possible

update_fields = {'status': "Extracted initial metadata"}
if doc_title: update_fields['title'] = doc_title
Expand Down Expand Up @@ -5940,27 +5942,51 @@ def process_di_document(document_id, user_id, temp_file_path, original_filename,

upload_to_blob(**args)

# Send chunk to Azure DI
update_callback(status=f"Sending {chunk_effective_filename} to Azure Document Intelligence...")
di_extracted_pages = []
try:
di_extracted_pages = extract_content_with_azure_di(chunk_path)
num_di_pages = len(di_extracted_pages)
conceptual_pages = num_di_pages if not is_image else 1 # Image is one conceptual item

if not di_extracted_pages and not is_image:
print(f"Warning: Azure DI returned no content pages for {chunk_effective_filename}.")
status_msg = f"Azure DI found no content in {chunk_effective_filename}."
# Update page count to 0 if nothing found, otherwise keep previous estimate or conceptual count
update_callback(number_of_pages=0 if idx == num_file_chunks else conceptual_pages, status=status_msg)
elif not di_extracted_pages and is_image:
print(f"Info: Azure DI processed image {chunk_effective_filename}, but extracted no text.")
update_callback(number_of_pages=conceptual_pages, status=f"Processed image {chunk_effective_filename} (no text found).")
else:
update_callback(number_of_pages=conceptual_pages, status=f"Received {num_di_pages} content page(s)/slide(s) from Azure DI for {chunk_effective_filename}.")
if is_legacy_doc:
update_callback(status=f"Extracting legacy Word content from {chunk_effective_filename}...")
try:
extracted_text = extract_word_text(chunk_path, file_ext)
if extracted_text and extracted_text.strip():
di_extracted_pages = [{
"page_number": 1,
"content": extracted_text,
}]
update_callback(number_of_pages=1, status=f"Extracted legacy Word content from {chunk_effective_filename}.")
else:
print(f"Warning: Legacy Word extractor returned no content for {chunk_effective_filename}.")
update_callback(number_of_pages=0, status=f"Legacy Word extractor found no content in {chunk_effective_filename}.")
except Exception as e:
raise Exception(f"Error extracting content from {chunk_effective_filename} with the legacy Word extractor: {str(e)}")
elif is_legacy_ppt:
update_callback(status=f"Extracting legacy PowerPoint content from {chunk_effective_filename}...")
try:
di_extracted_pages = extract_legacy_ppt_pages(chunk_path)
total_slides = len(di_extracted_pages)
update_callback(number_of_pages=total_slides, status=f"Extracted legacy PowerPoint content from {chunk_effective_filename}.")
except Exception as e:
raise Exception(f"Error extracting content from {chunk_effective_filename} with the legacy PowerPoint extractor: {str(e)}")
else:
# Send chunk to Azure DI
update_callback(status=f"Sending {chunk_effective_filename} to Azure Document Intelligence...")
try:
di_extracted_pages = extract_content_with_azure_di(chunk_path)
num_di_pages = len(di_extracted_pages)
conceptual_pages = num_di_pages if not is_image else 1 # Image is one conceptual item

if not di_extracted_pages and not is_image:
print(f"Warning: Azure DI returned no content pages for {chunk_effective_filename}.")
status_msg = f"Azure DI found no content in {chunk_effective_filename}."
# Update page count to 0 if nothing found, otherwise keep previous estimate or conceptual count
update_callback(number_of_pages=0 if idx == num_file_chunks else conceptual_pages, status=status_msg)
elif not di_extracted_pages and is_image:
print(f"Info: Azure DI processed image {chunk_effective_filename}, but extracted no text.")
update_callback(number_of_pages=conceptual_pages, status=f"Processed image {chunk_effective_filename} (no text found).")
else:
update_callback(number_of_pages=conceptual_pages, status=f"Received {num_di_pages} content page(s)/slide(s) from Azure DI for {chunk_effective_filename}.")

except Exception as e:
raise Exception(f"Error extracting content from {chunk_effective_filename} with Azure DI: {str(e)}")
except Exception as e:
raise Exception(f"Error extracting content from {chunk_effective_filename} with Azure DI: {str(e)}")

# --- Multi-Modal Vision Analysis (for images only) - Must happen BEFORE save_chunks ---
if is_image and enable_enhanced_citations and idx == 1: # Only run once for first chunk
Expand Down Expand Up @@ -6553,7 +6579,7 @@ def update_doc_callback(**kwargs):
update_doc_callback(status=f"Processing file {original_filename}, type: {file_ext}")

# --- 1. Dispatch to appropriate handler based on file type ---
# Note: .doc and .docm are handled separately by process_doc() using docx2txt
# Note: .doc uses the shared document pipeline with OLE extraction, while .docm stays on the direct Word-text path.

is_group = group_id is not None

Expand All @@ -6562,7 +6588,7 @@ def update_doc_callback(**kwargs):
"user_id": user_id,
"temp_file_path": temp_file_path,
"original_filename": original_filename,
"file_ext": file_ext if file_ext in tabular_extensions or file_ext in di_supported_extensions else None,
"file_ext": file_ext if file_ext in tabular_extensions or file_ext in di_supported_extensions or file_ext == '.doc' else None,
"enable_enhanced_citations": enable_enhanced_citations,
"update_callback": update_doc_callback
}
Expand Down Expand Up @@ -6597,7 +6623,7 @@ def update_doc_callback(**kwargs):
total_chunks_saved, total_embedding_tokens, embedding_model_name = result
else:
total_chunks_saved = result
elif file_ext in ('.doc', '.docm'):
elif file_ext == '.docm':
result = process_doc(**{k: v for k, v in args.items() if k != "file_ext"})
if isinstance(result, tuple) and len(result) == 3:
total_chunks_saved, total_embedding_tokens, embedding_model_name = result
Expand Down Expand Up @@ -6647,7 +6673,7 @@ def update_doc_callback(**kwargs):
group_id=group_id,
public_workspace_id=public_workspace_id
)
elif file_ext in di_supported_extensions:
elif file_ext in di_supported_extensions or file_ext == '.doc':
result = process_di_document(**args)
# Handle tuple return (chunks, tokens, model_name)
if isinstance(result, tuple) and len(result) == 3:
Expand Down Expand Up @@ -6722,7 +6748,11 @@ def update_doc_callback(**kwargs):
embedding_tokens=total_embedding_tokens,
embedding_model=embedding_model_name,
version=doc_metadata.get('version') if doc_metadata else None,
author=doc_metadata.get('author') if doc_metadata else None,
author=(
doc_metadata.get('author')
or ', '.join(ensure_list(doc_metadata.get('authors')))
or None
) if doc_metadata else None,
title=doc_metadata.get('title') if doc_metadata else None,
subject=doc_metadata.get('subject') if doc_metadata else None,
publication_date=doc_metadata.get('publication_date') if doc_metadata else None,
Expand Down
1 change: 1 addition & 0 deletions application/single_app/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Werkzeug==3.1.6
requests==2.33.0
openai==1.109.1
docx2txt==0.8
olefile==0.47
Markdown==3.8.1
bleach==6.1.0
azure-cosmos==4.9.0
Expand Down
18 changes: 16 additions & 2 deletions application/single_app/route_backend_chats.py
Original file line number Diff line number Diff line change
Expand Up @@ -7771,7 +7771,10 @@ def result_requires_message_reload(result: Any) -> bool:
final_api_source_refs.insert(insert_idx, 'system:default_prompt')
default_system_prompt_inserted = True

if not original_hybrid_search_enabled:
if should_apply_history_grounding_message(
original_hybrid_search_enabled,
prior_grounded_document_refs,
):
history_grounding_message = build_history_grounding_system_message()
insert_idx = 0
if (
Expand Down Expand Up @@ -10214,7 +10217,10 @@ def publish_live_plugin_thought(thought_payload):
final_api_source_refs.insert(insert_idx, 'system:default_prompt')
default_system_prompt_inserted = True

if not original_hybrid_search_enabled:
if should_apply_history_grounding_message(
original_hybrid_search_enabled,
prior_grounded_document_refs,
):
history_grounding_message = build_history_grounding_system_message()
insert_idx = 0
if (
Expand Down Expand Up @@ -11632,6 +11638,14 @@ def build_history_grounding_system_message():
}


def should_apply_history_grounding_message(
    original_hybrid_search_enabled,
    prior_grounded_document_refs,
):
    """Decide whether the history grounding system message should be added.

    Args:
        original_hybrid_search_enabled: Truthy when hybrid search was
            enabled for the request; any truthy value vetoes the message.
        prior_grounded_document_refs: References to documents grounded
            earlier in the conversation; must be truthy (non-empty) for
            the message to apply.

    Returns:
        bool: True only when hybrid search is off and at least one prior
        grounded document reference exists; False otherwise.
    """
    # Hybrid search being on rules the grounding message out regardless of
    # what prior references exist.
    if original_hybrid_search_enabled:
        return False

    # Without hybrid search, the message applies only if there is something
    # from earlier in the conversation to ground against.
    return True if prior_grounded_document_refs else False


def build_assistant_history_content_with_citations(message, content):
base_content = str(content or '').strip()
citation_sections = []
Expand Down
9 changes: 4 additions & 5 deletions application/single_app/route_frontend_chats.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,12 +538,11 @@ def upload_file():
# Continue without vision analysis

elif file_ext_nodot in {'doc', 'docm'}:
# Use docx2txt for .doc and .docm files
# Use OLE parsing for legacy .doc files and docx2txt for .docm files
try:
import docx2txt
extracted_content = docx2txt.process(temp_file_path)
except ImportError:
return jsonify({'error': 'docx2txt library required for .doc/.docm files'}), 500
extracted_content = extract_word_text(temp_file_path, f'.{file_ext_nodot}')
except Exception as e:
return jsonify({'error': f'Error extracting text from {filename}: {e}'}), 500
elif file_ext_nodot == 'txt':
extracted_content = extract_text_file(temp_file_path)
elif file_ext_nodot == 'md':
Expand Down
Loading
Loading