-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathutils.py
More file actions
38 lines (31 loc) · 1.25 KB
/
utils.py
File metadata and controls
38 lines (31 loc) · 1.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re
from collections import defaultdict
# Tag vocabularies used by extract_gnc() to pick the gender, number and case
# markers out of a colon-separated morphological interpretation.
gender_tags = {
    'm1',
    'm2',
    'm3',
    'f',
    'n',
}
number_tags = {
    'pl',
    'sg',
}
case_tags = {
    'nom',
    'gen',
    'dat',
    'acc',
    'inst',
    'loc',
    'voc',
}
def extract_gnc(interpretation):
    """Build a compact "pos:gender+number+case" label from a tag sequence.

    Args:
        interpretation: indexable sequence of tag strings whose first
            element is taken as the part of speech.

    Returns:
        ``"<pos>:<gnc>"`` where ``<gnc>`` is the first gender tag, first
        number tag and first case tag found (in that order, concatenated
        without a separator, missing ones skipped). If none of the three
        is present, just the part-of-speech tag is returned.
    """
    def first_match(tagset):
        # First token belonging to the given tag set, or None if absent.
        for token in interpretation:
            if token in tagset:
                return token
        return None

    features = [
        first_match(tagset)
        for tagset in (gender_tags, number_tags, case_tags)
    ]
    pos = interpretation[0]
    suffix = ''.join(tag for tag in features if tag)
    if not suffix:
        return pos
    return "{0}:{1}".format(pos, suffix)
def split_interpretation(interpretation):
    """Split a colon-separated interpretation string into its parts.

    Args:
        interpretation: string of the form ``"base:pos[:tag...]"``.

    Returns:
        A ``(base, pos, gnc)`` tuple: the first field, the second field,
        and the label produced by ``extract_gnc`` for every field after
        the first.

    Raises:
        IndexError: if the string contains no ``':'`` (there is no
            second field).
    """
    fields = interpretation.split(':')
    base, pos = fields[0], fields[1]
    return base, pos, extract_gnc(fields[1:])
# One or more characters that are either non-word characters or underscores.
# Raw string: '\W' in a plain literal is an invalid string escape that newer
# Python versions warn about. Compiled once at import instead of per call.
_NON_ALNUM_RE = re.compile(r'[\W_]+', re.UNICODE)


def remove_nonalpha(string):
    """Return *string* with all non-alphanumeric characters removed.

    Everything matched by ``[\\W_]+`` is deleted: punctuation, whitespace,
    symbols and underscores. Note that, despite the function's name,
    digits are word characters and are therefore kept.

    Args:
        string: text to clean.

    Returns:
        The text consisting only of the letters and digits of the input,
        in their original order.
    """
    return _NON_ALNUM_RE.sub('', string)
# One or more consecutive whitespace characters.
# Raw string: '\s' in a plain literal is an invalid string escape that newer
# Python versions warn about. Compiled once at import instead of per call.
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def contract_whitespace(string):
    """Collapse every run of whitespace in *string* into a single space.

    Tabs, newlines and repeated spaces are each replaced by one ``' '``.
    Leading and trailing whitespace is collapsed too, not stripped.

    Args:
        string: text to normalise.

    Returns:
        The text with each whitespace run replaced by one space.
    """
    return _WHITESPACE_RUN_RE.sub(' ', string)
def is_num(orth):
    """Tell whether the token *orth* looks like a number.

    A token counts as numeric when it is exactly one of the lowercase
    Roman numerals ``ii`` through ``x`` (``i`` is not in the list), or
    when it contains at least one digit character.

    Args:
        orth: the token text to inspect.

    Returns:
        True if the token is numeric by the rules above, False otherwise.
    """
    if orth in ('ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x'):
        return True
    for char in orth:
        if char.isdigit():
            return True
    return False
def is_valid(line):
    """Decide whether *line* passes the length and 'num' count checks.

    Args:
        line: any object supporting ``len()`` and ``.count()`` (e.g. a
            string or a list of tokens).

    Returns:
        True when the line has more than four elements and ``'num'``
        occurs fewer than three times in it; False otherwise.
    """
    # Too short: reject without counting.
    if len(line) <= 4:
        return False
    return line.count('num') < 3