Classes and functions to do syntactic analysis on Giellatekno XML documents.
analyse(xml_file, modename)
Analyse a file if it is not ocr'ed.
Source code in corpustools/analyser.py

def analyse(xml_file, modename):
    """Analyse a file if it is not ocr'ed."""
    try:
        path = corpuspath.make_corpus_path(xml_file)
        if not path.metadata.get_variable("ocr"):
            dependency_analysis(path, modename)
        else:
            print(xml_file, "is an OCR file and will not be analysed", file=sys.stderr)
    except (etree.XMLSyntaxError, UserWarning) as error:
        print("Can not parse", xml_file, file=sys.stderr)
        print("The error was:", str(error), file=sys.stderr)
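A minimal usage sketch, assuming a corpus checkout where the converted file below exists (the path is purely illustrative); passing modename=None lets get_modename() pick the pipeline from the file's language and metadata:

from corpustools import analyser

# Hypothetical path inside a corpus checkout; any converted .xml file works.
analyser.analyse("converted/sme/admin/some_document.xml", modename=None)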
analyse_in_parallel(file_list, modename, pool_size)
Analyse files in parallel.
Source code in corpustools/analyser.py

def analyse_in_parallel(file_list, modename, pool_size):
    """Analyse file in parallel."""
    file_list = list(file_list)
    print(f"Parallel analysis of {len(file_list)} files with {pool_size} workers")
    util.run_in_parallel(
        function=analyse,
        max_workers=pool_size,
        file_list=file_list,
        modename=modename,
    )
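A sketch of batch analysis; the converted/sme directory is an example location, and collect_files() is the same helper main() uses to gather .xml files:

from corpustools import analyser, corpuspath

# "hfst" is the default mode that get_modename() falls back to.
files = corpuspath.collect_files(["converted/sme"], suffix=".xml")
analyser.analyse_in_parallel(files, modename="hfst", pool_size=4)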
analyse_serially(file_list, modename)
Analyse files one by one.
Source code in /home/anders/projects/CorpusTools/corpustools/analyser.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144 | def analyse_serially(file_list, modename):
"""Analyse files one by one."""
xml_files = list(file_list)
print(f"Starting the analysis of {len(xml_files)} files")
fileno = 0
for xml_file in xml_files:
fileno += 1
# print some ugly banner cos i want to see progress on local
# batch job
util.print_frame("*" * 79)
util.print_frame(f"Analysing {xml_file} [{fileno} of {len(xml_files)}]")
util.print_frame("*" * 79)
analyse(xml_file, modename)
|
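The serial variant is convenient for debugging a single directory, since the progress banners are not interleaved by worker processes. A sketch under the same assumptions as above (example path, default "hfst" mode):

from corpustools import analyser, corpuspath

analyser.analyse_serially(
    corpuspath.collect_files(["converted/sme/admin"], suffix=".xml"),
    modename="hfst",
)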
ccatter(path)
Turn an XML-formatted file into clean text.
Source code in corpustools/analyser.py

def ccatter(path):
    """Turn an xml formatted file into clean text."""
    xml_printer = ccat.XMLPrinter(lang=path.lang, all_paragraphs=True)
    xml_printer.parse_file(path.converted)
    text = xml_printer.process_file().getvalue()
    if text:
        # Gruesome hack for mhr
        # When https://github.com/giellalt/lang-mhr/issues/3
        # is resolved, remove this
        if path.lang == "mhr":
            return " - ".join(part.strip() for part in text.split("-"))
        # end of hack
        return text
    raise UserWarning(f"Empty file {path.converted}")
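A sketch showing the plain text that is handed to the analysis pipeline, assuming an illustrative converted file path:

from corpustools import corpuspath
from corpustools.analyser import ccatter

# Hypothetical converted file; make_corpus_path builds a CorpusPath around it.
path = corpuspath.make_corpus_path("converted/sme/admin/some_document.xml")
print(ccatter(path)[:300])  # first 300 characters of the extracted text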
dependency_analysis(path, modename)
Insert disambiguation and dependency analysis into the body.
Source code in corpustools/analyser.py

def dependency_analysis(path, modename):
    """Insert disambiguation and dependency analysis into the body."""
    xml_file = etree.parse(path.converted)

    oldbody = xml_file.find(".//body")
    parent = oldbody.getparent()
    parent.remove(oldbody)

    body = etree.SubElement(parent, "body")
    dependency = etree.SubElement(body, "dependency")
    dependency.text = etree.CDATA(
        do_dependency_analysis(
            ccatter(path),
            modename=modename if modename is not None else get_modename(path),
            lang=path.lang,
        )
    )

    with util.ignored(OSError):
        os.makedirs(os.path.dirname(path.analysed))

    with open(path.analysed, "wb") as analysed_stream:
        analysed_stream.write(
            etree.tostring(
                xml_file, xml_declaration=True, encoding="utf8", pretty_print=True
            )
        )
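A sketch of running the analysis for one CorpusPath and inspecting the result; the file path is illustrative, and path.analysed is where the function writes the new XML whose body holds the <dependency> CDATA section:

from lxml import etree
from corpustools import corpuspath
from corpustools.analyser import dependency_analysis

path = corpuspath.make_corpus_path("converted/sme/admin/some_document.xml")
dependency_analysis(path, modename=None)  # None -> get_modename(path) chooses a mode

analysed = etree.parse(str(path.analysed))
print(analysed.find(".//dependency").text[:200])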
do_dependency_analysis(text, modename, lang)
Run the disambiguation and dependency analysis pipeline on the given text.
Source code in corpustools/analyser.py

def do_dependency_analysis(text, modename, lang):
    """Insert disambiguation and dependency analysis into the body."""
    pipeline = modes.Pipeline(modename, lang)
    pipeline.sanity_check()
    return pipeline.run(text.encode("utf8"))
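A sketch of running a pipeline directly on a string, bypassing the XML handling; "hfst" and "sme" are example values, and any mode from modes.list_modes() with a supported language should work:

from corpustools.analyser import do_dependency_analysis

# Example sentence and example mode/language; returns the raw pipeline output.
output = do_dependency_analysis("Dát lea ovdamearka.", modename="hfst", lang="sme")
print(output[:200])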
get_modename(path)
Get the analysis modename based on the CorpusPath's language and metadata.
Source code in corpustools/analyser.py

def get_modename(path):
    """Get the modename depending on the CorpusPath"""
    if path.lang == "mhr":
        year = path.metadata.get_variable("year")
        if year:
            if 1909 < int(year) < 1939:
                return "hfst_thirties"
    if path.lang == "mrj":
        year = path.metadata.get_variable("year")
        if year:
            if 1909 < int(year) < 1939:
                return "hfst_thirties"
            if 1939 < int(year) < 1995:
                return "hfst_eighties"
    if path.lang in ["nob", "fin", "fao"]:
        return "hfst_no_korp"
    return "hfst"
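An illustration of the selection rules above, assuming a hypothetical mhr file whose metadata year is 1925:

from corpustools import corpuspath
from corpustools.analyser import get_modename

# With lang "mhr" and year 1925 this returns "hfst_thirties"; nob/fin/fao files
# return "hfst_no_korp"; anything else falls back to "hfst".
path = corpuspath.make_corpus_path("converted/mhr/ficti/novel_from_1925.xml")
print(get_modename(path))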
main()
Analyse files in the given directories.
Source code in corpustools/analyser.py

def main():
    """Analyse files in the given directories."""
    args = parse_options()

    files = list(corpuspath.collect_files(args.converted_entities, suffix=".xml"))

    if args.skip_existing:
        non_skipped_files = []
        for file in files:
            cp = corpuspath.make_corpus_path(file)
            if not cp.analysed.exists():
                non_skipped_files.append(file)
        n_skipped_files = len(files) - len(non_skipped_files)
        print(f"--skip-existing given. Skipping {n_skipped_files} "
              "files that are already analysed")
        if n_skipped_files == len(files):
            print("nothing to do, exiting")
            raise SystemExit(0)
        files = non_skipped_files

    try:
        if args.serial:
            analyse_serially(files, args.modename)
        else:
            analyse_in_parallel(files, args.modename, args.ncpus)
    except util.ArgumentError as error:
        print(f"Cannot do analysis\n{str(error)}", file=sys.stderr)
        raise SystemExit(1)
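The function is meant to run as a command-line entry point; a sketch of driving it programmatically by faking argv (the directory is illustrative, and omitting --modename lets get_modename() pick a pipeline per file):

import sys
from corpustools import analyser

# --serial and the positional directory are options defined in parse_options() below.
sys.argv = ["analyser", "--serial", "converted/sme"]
analyser.main()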
parse_options()
Parse the command-line options.
Source code in corpustools/analyser.py

def parse_options():
    """Parse the given options."""
    parser = argparse.ArgumentParser(
        parents=[argparse_version.parser], description="Analyse files in parallel."
    )

    parser.add_argument("--ncpus", action=NCpus, default=multiprocessing.cpu_count() * 2)
    parser.add_argument(
        "--skip-existing",
        action="store_true",
        help="Skip analysis of files are already analysed (that already "
        "exist in the analysed/ folder"
    )
    parser.add_argument(
        "--serial",
        action="store_true",
        help="When this argument is used files will be analysed one by one."
        "Using --serial takes priority over --ncpus",
    )
    parser.add_argument(
        "-k",
        "--modename",
        choices=modes.list_modes(),
        help="You can set the analyser pipeline explicitely if you want.",
    )
    parser.add_argument(
        "converted_entities",
        nargs="+",
        help="converted files or director(y|ies) where the converted files exist",
    )

    return parser.parse_args()