LLM_Chat_Interface/parse_doc.py at main · NHLBI/LLM_Chat_Interface · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/python3
from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph
import sys
from pdfminer.high_level import extract_text
from pptx import Presentation
from pptx.shapes.group import GroupShape
from pptx.enum.shapes import MSO_SHAPE_TYPE
import pandas as pd
import os

'''
This function will return text from a file. Currently supports .txt, .md, .json, .xml, .docx,
.pptx, .pdf, .csv, and .xlsx files.

Input:  file (string) - the path to the file
        filename (string) - the file name, whose extension selects the parser
Output: text (string) - the contents of the file
'''
def parse_doc(file, filename):
    """Return the text of a document, choosing a parser from the file extension.

    Args:
        file: Path of the file on disk. This is the path that is checked for
            existence and size and then handed to the parser.
        filename: Name whose extension selects the parser. It is only used
            for extension matching here.

    Returns:
        The string produced by the matching ``parse_*`` helper.

    Raises:
        ValueError: If ``file`` does not exist, is zero bytes long, or
            ``filename`` does not end in a supported extension.
    """
    # Check if file exists
    if not os.path.exists(file):
        raise ValueError('File does not exist')

    # Check if file is not empty
    if os.path.getsize(file) == 0:
        raise ValueError('File is empty')

    # Extensions are matched case-insensitively so that names such as
    # "REPORT.PDF" are accepted. The lower-cased name is what gets passed on,
    # because each helper re-checks the extension itself.
    name = filename.lower()

    if name.endswith(('.txt', '.md', '.json', '.xml')):
        return parse_txt(file, name)
    if name.endswith('.docx'):
        return parse_docx(file, name)
    if name.endswith('.pptx'):
        return parse_pptx(file, name)
    if name.endswith('.pdf'):
        return parse_pdf(file, name)
    if name.endswith(('.csv', '.xlsx')):
        return parse_csv(file, name)

    raise ValueError('File type not supported')

'''
This function will return text from a pdf file. It does not read any images, and it does not
read tables intelligently. It will simply read the text in the order it appears in the pdf.

Input:  filepath (string) - the path to the file
Output: text (string) - the contents of the file
'''
def parse_pdf(file, filename):
    """Return the text of a PDF file as extracted by pdfminer's ``extract_text``.

    Args:
        file: Path of the PDF to read.
        filename: Name used only to verify the ``.pdf`` extension.

    Returns:
        The extracted text, exactly as ``extract_text`` returns it. This may
        be an empty string when nothing could be extracted.

    Raises:
        ValueError: If ``filename`` does not end in ``.pdf``.
    """
    # Check if file is a pdf
    if not filename.endswith('.pdf'):
        raise ValueError('File type not supported')

    # The extracted text is returned unchanged. An older "The file returned no
    # content" fallback sat after an unconditional return and could never run;
    # it was removed rather than activated so callers keep seeing the same
    # output they get today.
    return extract_text(file)

'''
This function will return text from a docx file. It does not read any images or headers/footers.

Input:  filepath (string) - the path to the file
Output: text (string) - the contents of the file
'''
def parse_docx(file, filename):
    """Return the paragraph and table text of a .docx file as one string.

    Args:
        file: Path of the .docx file to load.
        filename: Name used only to verify the ``.docx`` extension.

    Returns:
        A string in which every paragraph is followed by a newline. Each table
        starts with the literal marker ``Table`` (placed directly before the
        first row), every cell is followed by a tab, and every row by a
        newline.

    Raises:
        ValueError: If ``filename`` does not end in ``.docx``.
    """
    # Reject anything that is not a docx before touching the file
    if not filename.endswith('.docx'):
        raise ValueError('File type not supported')

    # Fragments are gathered in a list and joined once at the end.
    pieces = []

    # The docx package yields the document's paragraphs and tables in document
    # order. How whitespace is added between them could be revisited.
    for block in Document(file).iter_inner_content():
        if isinstance(block, Paragraph):
            pieces.append(block.text + '\n')
            continue
        if not isinstance(block, Table):
            continue

        pieces.append('Table')
        for row in block.rows:
            pieces.append(''.join(cell.text + '\t' for cell in row.cells) + '\n')

    # Potential TODO - read headers/footers

    return ''.join(pieces)

'''
Helper function for parse_pptx. This function will recursively check for text in a set of shapes.
'''
def check_recursively_for_text(this_set_of_shapes, text_run):
    """Collect the text of a set of pptx shapes, descending into grouped shapes.

    Args:
        this_set_of_shapes: Iterable of shapes, e.g. ``slide.shapes`` or the
            ``shapes`` of a group shape.
        text_run: List that the collected strings are appended to. It is
            modified in place.

    Returns:
        The same ``text_run`` list that was passed in. It holds one entry per
        paragraph that has at least one run, and one entry per table row
        (cells each followed by a tab).
    """
    for shape in this_set_of_shapes:

        # If this is a group, we have to call it recursively to get down to text/tables
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            check_recursively_for_text(shape.shapes, text_run)
        elif shape.has_text_frame:
            for paragraph in shape.text_frame.paragraphs:
                # Runs only mark formatting changes inside a paragraph. They
                # are glued back together here; emitting each run on its own
                # would make the caller's newline join split sentences
                # wherever the formatting changes.
                runs = paragraph.runs
                if runs:
                    text_run.append(''.join(run.text for run in runs))
        elif shape.has_table:
            for row in shape.table.rows:
                text_run.append(''.join(cell.text + '\t' for cell in row.cells))
    return text_run

'''
This function will return text from a pptx file
'''
def parse_pptx(file, filename):
    """Return the text found on the slides of a .pptx file, one entry per line.

    Args:
        file: Path of the .pptx file to load.
        filename: Name used only to verify the ``.pptx`` extension.

    Returns:
        The strings gathered by ``check_recursively_for_text`` from every
        slide, joined with newlines.

    Raises:
        ValueError: If ``filename`` does not end in ``.pptx``.
    """
    # Reject anything that is not a pptx before touching the file
    if not filename.endswith('.pptx'):
        raise ValueError('File type not supported')

    deck = Presentation(file)

    # Walk the slides in order, letting the helper pick up text frames, tables
    # and groups. Images and other objects are skipped.
    collected = []
    for slide in deck.slides:
        collected = check_recursively_for_text(slide.shapes, collected)

    return '\n'.join(collected)


'''
This function will return text from an ASCII file. Currently this accepts .txt, .md, .json, and .xml files.

Input:  filepath (string) - the path to the file
Output: contents (string) - the contents of the file
'''
def parse_txt(file, filename):
    """Return the contents of a plain-text file (.txt, .md, .json or .xml).

    Args:
        file: Path of the file to read.
        filename: Name used only to verify the extension.

    Returns:
        The whole file decoded as UTF-8. Bytes that are not valid UTF-8 are
        replaced with U+FFFD instead of raising.

    Raises:
        ValueError: If ``filename`` does not end in one of the supported
            extensions.
    """
    # Check if file is a txt, md, json, or xml
    if not filename.endswith(('.txt', '.md', '.json', '.xml')):
        raise ValueError('File type not supported')

    # The encoding is explicit so the result does not depend on the locale of
    # the process. errors='replace' keeps a file with a few stray bytes
    # readable instead of failing the whole parse.
    with open(file, 'r', encoding='utf-8', errors='replace') as f:
        return f.read()


'''
This function will return text from a csv or xlsx file.

Input: file_path (string) - the path to the file
Output: csv_text (string) - the contents of the file
'''
def parse_csv(file, filename):
    """Return the rows of a .csv or .xlsx file as JSON text with a short preamble.

    Args:
        file: Path of the file to read.
        filename: Name whose extension selects CSV or Excel handling.

    Returns:
        A string starting with a one-line preamble that describes the data,
        followed by the rows as JSON records, one per line. For .xlsx files
        every sheet is introduced by a ``--- Sheet: <name> ---`` line and
        followed by a newline.

    Raises:
        ValueError: If ``filename`` ends in neither ``.csv`` nor ``.xlsx``.
    """
    if filename.endswith('.csv'):
        # The preamble names the real source; a CSV has no tabs/sheets.
        description = 'Below is CSV data in the form of json.'
        body = pd.read_csv(file).to_json(orient='records', lines=True)
    elif filename.endswith('.xlsx'):
        description = 'Below is Excel data in the form of json, broken down by tabs.'
        # sheet_name=None loads every sheet as a {sheet name: DataFrame} dict.
        sheets = pd.read_excel(file, engine='openpyxl', sheet_name=None)
        body = ''.join(
            f'--- Sheet: {sheet_name} ---\n'
            + sheet_df.to_json(orient='records', lines=True)
            + '\n'  # Add a newline to separate sheets
            for sheet_name, sheet_df in sheets.items()
        )
    else:
        raise ValueError('File type not supported')

    # Add a pre-amble
    preamble = (
        description
        + ' Depending on the ask, you may need to query the data.'
        + ' Ensure that all your calculations are correct,'
        + ' showing your thought process when applicable.'
    )

    return preamble + '\n' + body

if __name__ == '__main__':
    # Command-line entry point: the first argument is the path of the file to
    # read, the second is the file name whose extension selects the parser.
    if len(sys.argv) < 3:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit(f'Usage: {sys.argv[0]} <file_path> <file_name>')
    print(parse_doc(sys.argv[1], sys.argv[2]))