Skip to content

decode

Code to detect and fix semi-official and unofficial encodings.

Eight-bit encodings of (Northern) Sami characters have been semi-official or unofficial standards, and text written in them has been converted to the various systems' internal encodings. This module has functions that revert the damage done.

decode_para(position, text)

Decode the text given to this function.

Replace letters in text with the ones from the dict at position position in CTYPES

Parameters:

Name Type Description Default
position str

an encoding name

required
text str

the text to decode

required

Returns:

Type Description
str

The decoded text

Source code in /home/anders/projects/CorpusTools/corpustools/decode.py
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
def decode_para(position, text):
    """Decode the text given to this function.

    Encoding names that have a dedicated fixer function are dispatched
    to that function. Every other value of position is handed on to
    default_decoder, which replaces letters in text with the ones from
    the dict at position position in CTYPES.

    Args:
        position (str): an encoding name
        text (str): the text to decode

    Returns:
        (str): The decoded text
    """
    which_decoder = {
        "mac-sami_to_cp1252": fix_macsami_cp1252,
        "mac-sami_to_latin1": fix_macsami_latin1,
        "mac-sami_to_mac": fix_macsami_mac,
        "winsami2_to_cp1252": fix_winsami2_cp1252,
        "cp1251_cp1252": fix_meadowmari_cp1252,
    }

    # Look the fixer up separately from calling it: with the call inside
    # a try/except KeyError, a KeyError raised by the fixer itself would
    # be mistaken for "no dedicated fixer" and silently rerouted.
    decoder = which_decoder.get(position)
    if decoder is None:
        return default_decoder(position, text)

    return decoder(text)

default_decoder(position, text)

The default decoder.

Parameters:

Name Type Description Default
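position str

Key into CTYPES selecting the replacement table to apply. When it is None the text is returned unchanged.

required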
text str

The string that should be decoded.

required

Returns:

Type Description
str
Source code in /home/anders/projects/CorpusTools/corpustools/decode.py
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
def default_decoder(position, text):
    """Apply the replacement table that CTYPES holds for an encoding.

    Args:
        position (str): Key into CTYPES selecting the replacement
            table. When it is None, text is returned untouched.
        text (str): The string that should be decoded.

    Returns:
        (str): text after each key of the selected table has been
            replaced by its value, in the table's iteration order.
    """
    if position is None:
        return text

    decoded = text
    for wrong, right in CTYPES[position].items():
        decoded = decoded.replace(wrong, right)

    return decoded

fix_macsami_cp1252(instring)

Fix instring.

Parameters:

Name Type Description Default
instring str

A bytestring that originally was encoded as macsami but has been decoded to unicode as if it was cp1252.

required

Returns:

Type Description
str

str with fixed encoding.

Source code in /home/anders/projects/CorpusTools/corpustools/decode.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def fix_macsami_cp1252(instring):
    """Fix instring.

    Args:
        instring (str): A bytestring that originally was encoded as
            macsami but has been decoded to unicode as if it was
            cp1252.

    Returns:
        (str): str with fixed encoding.
    """
    # Characters that cp1252 cannot encode leave this step as numeric
    # character references: U+0081 becomes the ASCII text "&#129;".
    bytestring = instring.encode("1252", errors="xmlcharrefreplace")
    # The reference survives the macsami decoding as plain text, so it is
    # the reference, not a bare character, that has to be matched here.
    # NOTE(review): presumably U+0081 stands for the macsami byte of "Å"
    # that a lenient cp1252 decoder passed through - confirm.
    encoded_unicode = bytestring.decode("macsami").replace("&#129;", "Å")
    return encoded_unicode

fix_macsami_latin1(instring)

Fix instring.

Parameters:

Name Type Description Default
instring str

A bytestring that originally was encoded as macsami but has been decoded to unicode as if it was latin1.

required

Returns:

Type Description
str

a string with fixed encoding.

Source code in /home/anders/projects/CorpusTools/corpustools/decode.py
52
53
54
55
56
57
58
59
60
61
62
63
def fix_macsami_latin1(instring):
    """Undo a latin1 misreading of macsami text.

    Args:
        instring (str): Text whose bytes originally were encoded as
            macsami but were decoded to unicode as if they were
            latin1.

    Returns:
        (str): a string with fixed encoding.
    """
    # Recover the byte values; characters outside latin1 are kept as
    # numeric character references instead of raising.
    latin1_bytes = instring.encode("latin1", errors="xmlcharrefreplace")
    return latin1_bytes.decode("macsami")

fix_macsami_mac(instring)

Fix instring.

Parameters:

Name Type Description Default
instring str

A bytestring that originally was encoded as macsami but has been decoded to unicode as if it was macroman.

required

Returns:

Type Description
str

a string with fixed encoding.

Source code in /home/anders/projects/CorpusTools/corpustools/decode.py
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def fix_macsami_mac(instring):
    """Fix instring.

    Args:
        instring (str): A bytestring that originally was encoded as
            macsami but has been decoded to unicode as if it was
            macroman.

    Returns:
        (str): a string with fixed encoding.
    """
    # Characters that macroman cannot encode leave this step as numeric
    # character references: U+2126 OHM SIGN becomes the text "&#8486;".
    bytestring = instring.encode("macroman", "xmlcharrefreplace")
    encoded_string = bytestring.decode("macsami")

    # The reference survives decoding as plain text and must be matched
    # as such; the bare character is still replaced as before.
    for wrong in ("&#8486;", "Ω"):
        encoded_string = encoded_string.replace(wrong, "ž")

    return encoded_string

fix_meadowmari_cp1252(instring)

Fix instring.

Parameters:

Name Type Description Default
instring str

A bytestring that originally was encoded as meadowmari but has been decoded to unicode as if it was cp1252.

required

Returns:

Type Description
str

a string with fixed encoding.

Source code in /home/anders/projects/CorpusTools/corpustools/decode.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def fix_meadowmari_cp1252(instring):
    """Fix instring.

    Args:
        instring (str): A bytestring that originally was encoded as
            meadowmari but has been decoded to unicode as if it was
            cp1252.

    Returns:
        (str): a string with fixed encoding.
    """
    mari_replacements = [
        # Cyrillic letters cannot be encoded as cp1252, so
        # xmlcharrefreplace below turns them into numeric character
        # references; the references are what shows up in the decoded
        # text and therefore what has to be matched.
        ("&#1118;", "ӱ"),  # xml char ref CYRILLIC SMALL LETTER SHORT U
        ("&#1038;", "Ӱ"),  # xml char ref CYRILLIC CAPITAL LETTER SHORT U
        ("&#1108;", "ӧ"),  # xml char ref CYRILLIC SMALL LETTER UKRAINIAN IE
        ("&#1028;", "Ӧ"),  # xml char ref CYRILLIC CAPITAL LETTER UKRAINIAN IE
        # The same letters as real characters, as before.
        ("ў", "ӱ"),
        ("Ў", "Ӱ"),
        ("є", "ӧ"),
        ("Є", "Ӧ"),
    ]

    # NOTE(review): assumes util.replace_all applies the pairs one after
    # the other; no replacement value is a key of another pair, so the
    # order of the pairs should not matter - confirm.
    return util.replace_all(
        mari_replacements,
        instring.encode("cp1252", errors="xmlcharrefreplace").decode("meadowmari"),
    )

fix_winsami2_cp1252(instring)

Fix instring.

Parameters:

Name Type Description Default
instring str

A bytestring that originally was encoded as winsami2 but has been decoded to unicode as if it was cp1252.

required

Returns:

Type Description
str

a string with fixed encoding.

Source code in /home/anders/projects/CorpusTools/corpustools/decode.py
83
84
85
86
87
88
89
90
91
92
93
94
def fix_winsami2_cp1252(instring):
    """Undo a cp1252 misreading of winsami2 text.

    Args:
        instring (str): Text whose bytes originally were encoded as
            winsami2 but were decoded to unicode as if they were
            cp1252.

    Returns:
        (str): a string with fixed encoding.
    """
    # Recover the byte values; characters outside cp1252 are kept as
    # numeric character references instead of raising.
    cp1252_bytes = instring.encode("cp1252", errors="xmlcharrefreplace")
    return cp1252_bytes.decode("ws2")

guess_body_encoding(content, mainlang)

Guess the encoding of the string content.

First get the frequencies of the "sami letters". Then get the frequencies of the letters in the encodings in CTYPES.

If "sami letters" that the encoding tries to fix exist in "content", disregard the encoding

Parameters:

Name Type Description Default
content str

the content

required
mainlang str

Three-letter language code

required

Returns:

Type Description
str

A codec name, as given in the keys of CTYPES, or None if no codec could be determined

Source code in /home/anders/projects/CorpusTools/corpustools/decode.py
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
def guess_body_encoding(content, mainlang):
    """Guess the encoding of the string content.

    The guess is made by looking for marker characters that are typical
    of text decoded with the wrong codec. The tests are tried in a fixed
    order and the first one that matches decides the result, so the
    order of the tests below is significant.

    Args:
        content (str): the content
        mainlang (str): Three-letter language code. It is only consulted
            by the two Cyrillic tests, which require it to be in
            CYRILLIC_LANGUAGES.

    Returns:
        (str): A codec name, or None if no codec could be determined
    """
    if "ì" in content and "ò" in content and mainlang in CYRILLIC_LANGUAGES:
        return "cyrillic_in_pdf"

    if "à" in content and "û" in content and mainlang in CYRILLIC_LANGUAGES:
        return "cp1251_cp1252"

    if ("‡" in content and "ã" not in content) or (
        "Œ" in content and "ÄŒ" not in content and "å" not in content
    ):
        return "mac-sami_to_cp1252"

    # The "‡ without ã" case was consumed by the test above, so it does
    # not need to be repeated here.
    if "Œ" in content or ("¯" in content and "á" not in content):
        return "mac-sami_to_latin1"

    # Both markers must actually be looked up in content; a bare "ã" is
    # a non-empty string and therefore always true.
    if "‡" in content and "ã" in content:
        return "mix-mac-sami-and-some-unknown-encoding"

    if "³" in content and "¢" in content and "¤" in content:
        return "iso-ir-197_to_cp1252"

    if "á" in content and ("ª" in content or "∫" in content):
        return "mac-sami_to_mac"

    if "ó" in content and "ç" in content and "ð" in content:
        return "winsam_to_cp1252"

    if "á" in content and "è" in content and "ð" in content:
        return "latin4_to_cp1252"

    if "ó" in content and "ç" in content and "¤" in content:
        return "mix-of-latin4-and-iso-ir-197_to_cp1252"

    if "„" in content and ("˜" in content or "¹" in content):
        return "winsami2_to_cp1252"

    if "þ" in content and "š" in content and "á" in content:
        return "finnish-lawtexts-in-pdf"

    if "á" in content:
        return "double-utf8"

    return None

guess_file_encoding(filename, mainlang)

Guess the encoding of a file.

Parameters:

Name Type Description Default
filename str

the file to open

required
mainlang str

language code, passed on to guess_body_encoding

required

Returns:

Type Description
str

A codec name, as given in the keys of CTYPES, or None if no codec could be determined

Source code in /home/anders/projects/CorpusTools/corpustools/decode.py
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
def guess_file_encoding(filename, mainlang):
    """Guess the encoding of a file.

    The whole file is read as text and the guessing itself is left to
    guess_body_encoding.

    Args:
        filename (str): the file to open
        mainlang (str): language code, passed on unchanged to
            guess_body_encoding

    Returns:
        (str): A codec name, as given in the keys of CTYPES, or None
            if no codec could be determined
    """
    # NOTE(review): no encoding is given to open(), so the file is read
    # with the platform's default text encoding - confirm this is wanted.
    with open(filename) as infile:
        return guess_body_encoding(infile.read(), mainlang)