-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathJSONgemini.py
More file actions
92 lines (72 loc) · 3.58 KB
/
JSONgemini.py
File metadata and controls
92 lines (72 loc) · 3.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import docx
import json
import os
def _new_node(title):
    """Return an empty outline node with the given title."""
    return {"title": title, "content": []}


def _heading_level(style_name):
    """Return 1, 2 or 3 when the style name starts with "Heading N", else None."""
    for level in (1, 2, 3):
        if style_name.startswith(f"Heading {level}"):
            return level
    return None


def _build_outline(paragraphs):
    """Group paragraphs into a section / subsection / subsubsection tree.

    Each paragraph must expose ``.text`` and ``.style.name``. Headings 1-3
    open a new node at the matching depth; "Normal" and "Body Text"
    paragraphs are attached to the deepest node currently open. Paragraphs
    with any other style, and paragraphs that are empty after stripping,
    are skipped.
    """
    outline = []
    section = subsection = subsubsection = None

    for paragraph in paragraphs:
        text = paragraph.text.strip()
        if not text:
            continue

        # A style without a name is treated like any other unrecognised
        # style (skipped) instead of aborting the whole conversion.
        style_name = paragraph.style.name or ""
        level = _heading_level(style_name)

        if level == 1:
            section = _new_node(text)
            outline.append(section)
            subsection = subsubsection = None
        elif level == 2:
            if section is None:
                section = _new_node("Untitled Section")
                outline.append(section)
            subsection = _new_node(text)
            section["content"].append(subsection)
            subsubsection = None
        elif level == 3:
            # A Heading 3 may appear before any Heading 1/2; synthesise the
            # missing ancestors rather than indexing into None.
            if section is None:
                section = _new_node("Untitled Section")
                outline.append(section)
            if subsection is None:
                subsection = _new_node("Untitled Subsection")
                section["content"].append(subsection)
            subsubsection = _new_node(text)
            subsection["content"].append(subsubsection)
        elif style_name.startswith(("Normal", "Body Text")):
            if section is None:
                # Body text ahead of the first heading goes into an
                # implicit introduction section.
                section = _new_node("Introduction")
                outline.append(section)
            if subsubsection is not None:
                target = subsubsection
            elif subsection is not None:
                target = subsection
            else:
                target = section
            target["content"].append({"type": "paragraph", "text": text})

    return outline


def docx_to_json(docx_path, output_path):
    """Convert a Word document to a nested JSON outline and save it.

    The document's paragraphs are grouped by their "Heading 1/2/3" styles
    into a list of ``{"title": ..., "content": [...]}`` nodes; body
    paragraphs become ``{"type": "paragraph", "text": ...}`` items. The
    result is written to ``output_path`` as UTF-8 JSON with 4-space
    indentation and non-ASCII characters left unescaped.

    Args:
        docx_path: Path of the document to read.
        output_path: Path of the JSON file to write.

    Returns:
        None. Progress and errors are reported with ``print``; exceptions
        are caught here so a caller looping over many files keeps going.
    """
    try:
        doc = docx.Document(docx_path)
        document_data = _build_outline(doc.paragraphs)
        json_output = json.dumps(document_data, indent=4, ensure_ascii=False)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(json_output)
        print(f"Conversion successful. JSON output saved to {output_path}")
    except docx.opc.exceptions.PackageNotFoundError:
        # NOTE(review): python-docx appears to raise this for paths that are
        # not a valid package as well as for missing files - confirm before
        # relying on the wording of this message.
        print(f"Error: File not found: {docx_path}")
    except Exception as e:
        # Deliberate best-effort boundary: report and move on.
        print(f"An error occurred: {e} while processing {docx_path}")
def process_directory(input_dir, output_dir):
    """Convert every Word document found directly in ``input_dir`` to JSON.

    For each entry whose name ends in ``.docx`` or ``.doc`` (case-insensitive)
    and does not start with ``~$``, ``docx_to_json`` is called with an output
    path in ``output_dir`` that reuses the file's base name with a ``.json``
    extension. Sub-directories are not descended into.

    Args:
        input_dir: Directory to scan for documents.
        output_dir: Directory that receives the JSON files; it is created
            (including missing parents) when it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so that runs over the same directory process files in the same
    # order regardless of the filesystem's listing order.
    for filename in sorted(os.listdir(input_dir)):
        if filename.startswith("~$"):
            # Names with this prefix are temporary files, not documents.
            continue
        # NOTE(review): python-docx is documented for .docx only; .doc files
        # are still passed on and any failure is reported by docx_to_json.
        if not filename.lower().endswith((".docx", ".doc")):
            continue

        input_path = os.path.join(input_dir, filename)
        output_filename = os.path.splitext(filename)[0] + ".json"
        output_path = os.path.join(output_dir, output_filename)
        docx_to_json(input_path, output_path)
# Example usage: hard-coded source and destination folders for a batch run.
input_directory = "/Users/km/Documents/Convert"
output_directory = "/Users/km/Documents/JSON"

# Only convert when executed as a script, so importing this module to reuse
# its functions does not touch the filesystem.
if __name__ == "__main__":
    process_directory(input_directory, output_directory)