Skip to content

avvirconverter

Convert Ávvir-files to the Giella xml format.

convert2intermediate(filename)

Convert Ávvir xml files to the giellatekno xml format.

The root node in an Ávvir document is article. article nodes contain one or more story nodes. story nodes contain one or more p nodes. p nodes contain span, br and (since 2013) p nodes.

Source code in /home/anders/projects/CorpusTools/corpustools/avvirconverter.py
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
def convert2intermediate(filename):
    """Parse an Ávvir xml file and turn it into a Giella xml document.

    An Ávvir document has an article root that contains one or more story
    elements. story elements contain one or more p elements, and p elements
    contain span, br and (since 2013) nested p elements.

    Args:
        filename: the Ávvir xml file, passed unchanged to etree.parse.

    Returns:
        (etree.Element): the document element produced by convert_article.
    """
    article = etree.parse(filename).getroot()

    # Each step mutates the tree in place; they run in this fixed order.
    for step in (remove_identical_ids, convert_p, convert_story, fix_quotemarks):
        step(article)

    return convert_article(article)

convert_article(avvir_doc)

The root element of an Ávvir doc is article, rename it to body.

Parameters:

Name Type Description Default
avvir_doc etree.Element

the etree that should be manipulated.

required

Returns:

Type Description
etree.Element

The document root of the basic Giella xml document.

Source code in /home/anders/projects/CorpusTools/corpustools/avvirconverter.py
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def convert_article(avvir_doc):
    """Rename the Ávvir root element to body and wrap it in a document element.

    Args:
        avvir_doc (etree.Element): the etree that should be manipulated;
            its tag is changed in place.

    Returns:
        (etree.Element): a new document element whose only child is the
            renamed avvir_doc.
    """
    giella_root = etree.Element("document")

    avvir_doc.tag = "body"
    giella_root.append(avvir_doc)

    return giella_root

convert_p(avvir_doc)

Convert story/p elements to one or more p elements.

Parameters:

Name Type Description Default
avvir_doc etree.Element

the etree that should be manipulated.

required
Source code in /home/anders/projects/CorpusTools/corpustools/avvirconverter.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def convert_p(avvir_doc):
    """Convert story/p elements to one or more p elements.

    For every p that is a direct child of a story directly below avvir_doc,
    the class attribute is dropped and the element is passed to convert_sub_p
    and convert_subelement. A p that is left without any non-whitespace text
    is removed from its story.

    Args:
        avvir_doc (etree.Element): the etree that should be manipulated.
    """
    for paragraph in avvir_doc.findall("./story/p"):
        if "class" in paragraph.attrib:
            del paragraph.attrib["class"]

        convert_sub_p(paragraph)
        convert_subelement(paragraph)

        # Nothing but whitespace (or nothing at all) is left: drop the p.
        if not (paragraph.text and paragraph.text.strip()):
            paragraph.getparent().remove(paragraph)

convert_story(avvir_doc)

Convert story elements into giellatekno xml elements.

Parameters:

Name Type Description Default
avvir_doc etree.Element

the etree that should be manipulated.

required
Source code in /home/anders/projects/CorpusTools/corpustools/avvirconverter.py
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
def convert_story(avvir_doc):
    """Convert story elements into giellatekno xml elements.

    Stories with class "Tittel" or "Undertittel" (at any depth) become
    section elements: their direct p children get type="title" and the
    class and id attributes are dropped.

    Every remaining story that is a direct child of avvir_doc is dissolved:
    its direct p children are moved into the story's parent, right after the
    story and in their original order, and the story itself is removed.

    Args:
        avvir_doc (etree.Element): the etree that should be manipulated.
    """
    title_xpaths = (
        './/story[@class="Tittel"]',
        './/story[@class="Undertittel"]',
    )
    for title_xpath in title_xpaths:
        for title in avvir_doc.findall(title_xpath):
            for para in title.findall("./p"):
                para.set("type", "title")

            # pop() instead of del: a title story that lacks an id must not
            # abort the whole conversion with a KeyError.
            title.attrib.pop("class", None)
            title.attrib.pop("id", None)

            title.tag = "section"

    # Title stories were renamed to section above, so only ordinary stories
    # are matched here.
    for story in avvir_doc.findall("./story"):
        parent = story.getparent()
        for offset, para in enumerate(story.findall("./p"), start=1):
            parent.insert(parent.index(story) + offset, para)

        parent.remove(story)

convert_sub_p(para)

Convert p element found inside story/p elements.

These elements contain erroneous text that an editor has removed. This function discards the nested p element together with its text, but keeps its tail.

Parameters:

Name Type Description Default
para lxml.etree.Element

an lxml element, it is a story/p element

required
Source code in /home/anders/projects/CorpusTools/corpustools/avvirconverter.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def convert_sub_p(para):
    """Remove p elements found inside a story/p element.

    These elements contain erroneous text that an editor has removed.
    Each nested p is dropped together with its text. Its tail is kept by
    appending it to the tail of the preceding sibling or, when there is no
    preceding sibling, to the text of the nested p's parent.

    Args:
        para (lxml.etree.Element): an lxml element, it is a story/p element
    """
    for sub_p in para.findall(".//p"):
        parent = sub_p.getparent()
        tail = sub_p.tail

        if tail is not None:
            previous = sub_p.getprevious()
            if previous is None:
                parent.text = (parent.text or "") + tail
            else:
                previous.tail = (previous.tail or "") + tail

        # ".//p" also matches p elements that are not direct children of
        # para (e.g. a p inside a span). Removing from para would raise
        # ValueError for those, so remove from the actual parent.
        parent.remove(sub_p)

convert_subelement(para)

Convert subelements of story/p elements to p elements.

Parameters:

Name Type Description Default
para lxml.etree.Element

an lxml element, it is a story/p element

required
Source code in /home/anders/projects/CorpusTools/corpustools/avvirconverter.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def convert_subelement(para):
    """Convert subelements of story/p elements to p elements.

    For every child of para, the child's text, the text and tail of each of
    its own children, and finally the child's tail are handed to
    insert_element in that order. The child is then removed from para.
    Text nested deeper than the grandchildren of para is not visited.

    Args:
        para (lxml.etree.Element): an lxml element, it is a story/p element
    """
    position = 1
    # Iterate over a snapshot of the children: each one is removed from para
    # inside the loop, and mutating an element while iterating it directly
    # depends on how the element iterator happens to look ahead.
    for subelement in list(para):
        position = insert_element(para, subelement.text, position)

        for subsubelement in subelement:
            position = insert_element(para, subsubelement.text, position)
            position = insert_element(para, subsubelement.tail, position)

        position = insert_element(para, subelement.tail, position)

        para.remove(subelement)

fix_quotemarks(avvir_doc)

Ávvir has funky quotemarks that seem to be a conversion error from their side.

Source code in /home/anders/projects/CorpusTools/corpustools/avvirconverter.py
172
173
174
175
176
177
178
179
180
181
def fix_quotemarks(avvir_doc):
    """Replace Ávvir's doubled angle quotemarks with guillemets.

    Ávvir has funky quotemarks that seem to be a conversion error from their
    side. Every "‹‹" becomes "«" and every "››" becomes "»" in the text and
    tail of all descendants of avvir_doc. The text and tail of avvir_doc
    itself are left untouched.

    Args:
        avvir_doc (etree.Element): the element whose descendants are fixed
            in place.
    """
    fixes = (("‹‹", "«"), ("››", "»"))

    for node in avvir_doc:
        for slot in ("text", "tail"):
            content = getattr(node, slot)
            if content:
                for wrong, right in fixes:
                    content = content.replace(wrong, right)
                setattr(node, slot, content)

        # Descend so that nodes at every depth are handled.
        fix_quotemarks(node)

insert_element(para, text, position)

Insert a new element in p's parent.

Parameters:

Name Type Description Default
para lxml.etree.Element

an lxml element, it is a story/p element

required
text str

string

required
position int

the position inside p's parent where the new element is inserted

required

Returns:

Type Description
int

the position

Source code in /home/anders/projects/CorpusTools/corpustools/avvirconverter.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def insert_element(para, text, position):
    """Insert a new p element holding text into para's parent.

    Nothing is inserted when text is None or consists only of whitespace.

    Args:
        para (lxml.etree.Element): an lxml element, it is a story/p element
        text (str): the text of the new p element
        position (int): offset from para's own index inside para's parent
            at which the new element is inserted

    Returns:
        (int): position + 1 if an element was inserted, otherwise position
            unchanged
    """
    if text is None or text.strip() == "":
        return position

    paragraph = etree.Element("p")
    paragraph.text = text

    container = para.getparent()
    container.insert(container.index(para) + position, paragraph)

    return position + 1

remove_identical_ids(avvir_doc)

Remove story elements whose id duplicates the id of an earlier story.

Parameters:

Name Type Description Default
avvir_doc etree.Element

the etree that should be manipulated.

required
Source code in /home/anders/projects/CorpusTools/corpustools/avvirconverter.py
23
24
25
26
27
28
29
30
31
32
33
34
35
def remove_identical_ids(avvir_doc):
    """Remove story elements whose id has already been seen.

    Stories that carry an id attribute are visited in the order the xpath
    query returns them. The first story with a given id is kept, and every
    later story with the same id is removed from its parent.

    Args:
        avvir_doc (etree.Element): the etree that should be manipulated.
    """
    seen_ids = set()
    for story in avvir_doc.xpath(".//story[@id]"):
        identifier = story.get("id")
        if identifier in seen_ids:
            story.getparent().remove(story)
            continue

        seen_ids.add(identifier)