# Source: PyLex/main.py (develop branch of dweng0/PyLex on GitHub)

"""
A command-line tool for tokenizing an input file using a specified lexer configuration.

This script reads an input file and a YAML lexer configuration file, tokenizes the input
using the provided lexer, and prints the resulting tokens.

Usage:
    python main.py <input_file> <lexer_config>

Arguments:
    input_file     The path to the input file to be tokenized.
    lexer_config   The path to the YAML lexer configuration file.

Example:
    python main.py source_code.js lexers/javascript.yaml
"""

import argparse
import yaml
import sys

from lexers.lexer import process_tokens, precompile_patterns
from tokenizer.tokenizer import tokenize

def read_file(file_path):
    """
    Read the contents of a UTF-8 text file and return it as a string.

    Every failure is reported on stderr and terminates the process with
    exit status 1; no I/O or decoding exception propagates to the caller.

    Parameters:
        file_path (str): The path to the file to read.

    Returns:
        str: The contents of the file.

    Raises:
        SystemExit: With status 1 if the file does not exist, cannot be
            read, or is not valid UTF-8.
    """
    try:
        with open(file_path, encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}", file=sys.stderr)
        sys.exit(1)
    except PermissionError:
        print(f"Error: Permission denied: {file_path}", file=sys.stderr)
        sys.exit(1)
    except UnicodeDecodeError as e:
        # UnicodeDecodeError derives from ValueError, not OSError, so the
        # generic IOError handler below would not catch it.
        print(f"Error: File is not valid UTF-8: {file_path}: {e}", file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        print(f"Error: Cannot read file {file_path}: {e}", file=sys.stderr)
        sys.exit(1)

def read_lexer_config(config_file_name):
    """
    Read a YAML configuration file for the lexer and return the parsed data.

    The file is decoded as UTF-8 (matching read_file) instead of the
    platform's default encoding, so the same config parses identically on
    every system. Every failure is reported on stderr and terminates the
    process with exit status 1; no exception propagates to the caller.

    Parameters:
        config_file_name (str): The path to the YAML configuration file.

    Returns:
        dict: The lexer configuration data parsed from the YAML file
            (whatever yaml.safe_load produces for the document).

    Raises:
        SystemExit: With status 1 if the file does not exist, cannot be
            read, is not valid UTF-8, or contains invalid YAML.
    """
    try:
        with open(config_file_name, encoding="utf-8") as file:
            # safe_load builds only plain Python objects, never arbitrary ones.
            return yaml.safe_load(file)
    except FileNotFoundError:
        print(f"Error: Lexer config file not found: {config_file_name}", file=sys.stderr)
        sys.exit(1)
    except PermissionError:
        print(f"Error: Permission denied: {config_file_name}", file=sys.stderr)
        sys.exit(1)
    except UnicodeDecodeError as e:
        # Raised while the parser pulls text from the stream; it is a
        # ValueError subclass, so neither handler below would catch it.
        print(f"Error: Lexer config is not valid UTF-8: {config_file_name}: {e}", file=sys.stderr)
        sys.exit(1)
    except yaml.YAMLError as e:
        print(f"Error: Invalid YAML in lexer config: {e}", file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        print(f"Error: Cannot read lexer config {config_file_name}: {e}", file=sys.stderr)
        sys.exit(1)

def main():
    """
    Run the tokenizer command-line interface.

    Takes two positional arguments (the input file and the YAML lexer
    configuration), loads both, precompiles the lexer's patterns,
    tokenizes the input text, and prints the resulting tokens.
    """
    cli = argparse.ArgumentParser(
        description="Tokenize an input file using a lexer configuration."
    )
    positional_args = (
        ('input_file', 'The path to the input file to be tokenized.'),
        ('lexer_config', 'The path to the YAML lexer configuration file.'),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, type=str, help=arg_help)
    options = cli.parse_args()

    # Load the input before the config, so input-file errors surface first.
    source_text = read_file(options.input_file)
    config = read_lexer_config(options.lexer_config)

    # Compile the config's regex patterns before tokenizing; the call's
    # return value is not used here.
    precompile_patterns(config)

    print(tokenize(source_text, config, process_tokens))

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()