Make unaccent handle all diacritics known to Unicode, and expand ligatures correctly

author Teodor Sigaev <teodor@sigaev.ru>

Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)

committer Teodor Sigaev <teodor@sigaev.ru>

Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)
author Teodor Sigaev <teodor@sigaev.ru>
Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)
committer Teodor Sigaev <teodor@sigaev.ru>
Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py

new file mode 100644 (file)

index 0000000..b838d8f
--- /dev/null
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+#
+# This script builds unaccent.rules on standard output when given the
+# contents of UnicodeData.txt[1] on standard input.  Optionally includes
+# ligature expansion, if --expand-ligatures is given on the command line.
+#
+# The approach is to use the Unicode decomposition data to identify
+# precomposed codepoints that are equivalent to a ligature of several
+# letters, or a base letter with any number of diacritical marks.
+# There is also a small set of special cases for codepoints that we
+# traditionally support even though Unicode doesn't consider them to
+# be ligatures or letters with marks.
+#
+# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
+
+import re
+import sys
+
+def print_record(codepoint, letter):
+    """Print one rule: the character, a tab and its replacement, as UTF-8."""
+    line = unichr(codepoint) + "\t" + letter
+    print line.encode("UTF-8")
+
+class Codepoint:
+    """One parsed entry: codepoint number, general category, decomposition."""
+
+    def __init__(self, id, general_category, combining_ids):
+        # combining_ids lists the codepoint numbers of the decomposition
+        self.id, self.general_category, self.combining_ids = \
+            id, general_category, combining_ids
+
+def is_plain_letter(codepoint):
+    """Return true if codepoint is an unaccented ASCII letter (a-z, A-Z)."""
+    value = codepoint.id
+    if ord('a') <= value <= ord('z'):
+        return True
+    return ord('A') <= value <= ord('Z')
+
+def is_mark(codepoint):
+    """Return true if the general category is Mn, Me or Mc (a mark)."""
+    category = codepoint.general_category
+    return category == "Mn" or category == "Me" or category == "Mc"
+
+def is_letter_with_marks(codepoint, table):
+    """Return true if codepoint decomposes to a plain letter plus marks."""
+    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
+    parts = codepoint.combining_ids
+    if len(parts) <= 1:
+        return False
+    # the first element must be the base letter ...
+    if not is_plain_letter(table[parts[0]]):
+        return False
+    # ... and every remaining element must be a mark
+    for mark_id in parts[1:]:
+        if not is_mark(table[mark_id]):
+            return False
+    return True
+
+def is_letter(codepoint, table):
+    """Return true for a plain letter, or a plain letter carrying marks."""
+    if is_plain_letter(codepoint):
+        return True
+    return is_letter_with_marks(codepoint, table)
+
+def get_plain_letter(codepoint, table):
+    """Return the base letter's Codepoint, without any marks.
+
+    A letter with marks yields the table entry for the first element of
+    its decomposition; a plain ASCII letter is returned unchanged.
+    Raises ValueError for any other codepoint.
+    """
+    if is_letter_with_marks(codepoint, table):
+        return table[codepoint.combining_ids[0]]
+    elif is_plain_letter(codepoint):
+        return codepoint
+    else:
+        # A string is not a valid exception object; raising one fails with
+        # an unrelated TypeError instead of reporting the real problem.
+        raise ValueError("codepoint U+%04X is not a letter" % codepoint.id)
+
+def is_ligature(codepoint, table):
+    """Return true if every element of the decomposition is a letter."""
+    for part_id in codepoint.combining_ids:
+        if not is_letter(table[part_id], table):
+            return False
+    return True
+
+def get_plain_letters(codepoint, table):
+    """Return the base letter of each element of a ligature, in order."""
+    assert(is_ligature(codepoint, table))
+    letters = []
+    for part_id in codepoint.combining_ids:
+        letters.append(get_plain_letter(table[part_id], table))
+    return letters
+
+def main(expand_ligatures):
+    """Read UnicodeData.txt on stdin and print unaccent rules on stdout.
+
+    expand_ligatures: when true, also print rules that expand ligatures
+    into their component letters.
+    """
+    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
+    decomposition_type_pattern = re.compile(" *<[^>]*> *")
+
+    # Local names deliberately avoid shadowing the builtins all() and id().
+    table = {}       # codepoint number -> Codepoint, for decomposition lookups
+    codepoints = []  # every Codepoint, in input order
+
+    # read everything we need into memory
+    for line in sys.stdin:
+        fields = line.split(";")
+        if len(fields) > 5:
+            # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
+            general_category = fields[2]
+            # replace any "<type>" tag so only hex codepoints remain
+            decomposition = decomposition_type_pattern.sub(' ', fields[5])
+            codepoint_id = int(fields[0], 16)
+            combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
+            codepoint = Codepoint(codepoint_id, general_category, combining_ids)
+            table[codepoint_id] = codepoint
+            codepoints.append(codepoint)
+
+    # walk through all the codepoints looking for interesting mappings
+    for codepoint in codepoints:
+        if codepoint.general_category.startswith('L') and \
+           len(codepoint.combining_ids) > 1:
+            if is_letter_with_marks(codepoint, table):
+                print_record(codepoint.id,
+                             chr(get_plain_letter(codepoint, table).id))
+            elif expand_ligatures and is_ligature(codepoint, table):
+                print_record(codepoint.id,
+                             "".join(unichr(combining_codepoint.id)
+                                     for combining_codepoint \
+                                     in get_plain_letters(codepoint, table)))
+
+    # some special cases, printed unconditionally after the derived rules
+    print_record(0x00d8, "O") # LATIN CAPITAL LETTER O WITH STROKE
+    print_record(0x00f8, "o") # LATIN SMALL LETTER O WITH STROKE
+    print_record(0x0110, "D") # LATIN CAPITAL LETTER D WITH STROKE
+    print_record(0x0111, "d") # LATIN SMALL LETTER D WITH STROKE
+    print_record(0x0131, "i") # LATIN SMALL LETTER DOTLESS I
+    print_record(0x0126, "H") # LATIN CAPITAL LETTER H WITH STROKE
+    print_record(0x0127, "h") # LATIN SMALL LETTER H WITH STROKE
+    print_record(0x0141, "L") # LATIN CAPITAL LETTER L WITH STROKE
+    print_record(0x0142, "l") # LATIN SMALL LETTER L WITH STROKE
+    print_record(0x0149, "'n") # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+    print_record(0x0166, "T") # LATIN CAPITAL LETTER T WITH STROKE
+    print_record(0x0167, "t") # LATIN SMALL LETTER T WITH STROKE
+    print_record(0x0401, u"\u0415") # CYRILLIC CAPITAL LETTER IO
+    print_record(0x0451, u"\u0435") # CYRILLIC SMALL LETTER IO
+    if expand_ligatures:
+        # multi-letter replacements, only printed on request
+        print_record(0x00c6, "AE") # LATIN CAPITAL LETTER AE
+        print_record(0x00df, "ss") # LATIN SMALL LETTER SHARP S
+        print_record(0x00e6, "ae") # LATIN SMALL LETTER AE
+        print_record(0x0152, "OE") # LATIN CAPITAL LIGATURE OE
+        print_record(0x0153, "oe") # LATIN SMALL LIGATURE OE
+
+if __name__ == "__main__":
+    # Ligatures are expanded only when the sole argument is the flag itself.
+    main(sys.argv[1:] == ["--expand-ligatures"])
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules

index cc2f7a65858d7e265850cea4592528c3866ca698..73c24a188badf9dfcbf5ab3950841232014e3c22 100644 (file)
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -4,22 +4,59 @@
  Ã A
  Ä A
  Å A
-Æ A
+Ç C
+È E
+É E
+Ê E
+Ë E
+Ì I
+Í I
+Î I
+Ï I
+Ñ N
+Ò O
+Ó O
+Ô O
+Õ O
+Ö O
+Ù U
+Ú U
+Û U
+Ü U
+Ý Y
  à a
  á a
  â a
  ã a
  ä a
  å a
-æ a
+ç c
+è e
+é e
+ê e
+ë e
+ì i
+í i
+î i
+ï i
+ñ n
+ò o
+ó o
+ô o
+õ o
+ö o
+ù u
+ú u
+û u
+ü u
+ý y
+ÿ y
  Ā A
  ā a
  Ă A
  ă a
  Ą A
  ą a
-Ç C
-ç c
  Ć C
  ć c
  Ĉ C
@@ -30,16 +67,6 @@
  č c
  Ď D
  ď d
-Đ D
-đ d
-È E
-É E
-Ê E
-Ë E
-è e
-é e
-ê e
-ë e
  Ē E
  ē e
  Ĕ E
@@ -60,17 +87,7 @@
  ģ g
  Ĥ H
  ĥ h
-Ħ H
-ħ h
  Ĩ I
-Ì I
-Í I
-Î I
-Ï I
-ì i
-í i
-î i
-ï i
  ĩ i
  Ī I
  ī i
@@ -79,62 +96,36 @@
  Į I
  į i
  İ I
-ı i
-Ĳ I
-ĳ i
+Ĳ IJ
+ĳ ij
  Ĵ J
  ĵ j
  Ķ K
  ķ k
-ĸ k
  Ĺ L
  ĺ l
  Ļ L
  ļ l
  Ľ L
  ľ l
-Ŀ L
-ŀ l
-Ł L
-ł l
-Ñ N
-ñ n
  Ń N
  ń n
  Ņ N
  ņ n
  Ň N
  ň n
-ŉ n
-Ŋ N
-ŋ n
-Ò O
-Ó O
-Ô O
-Õ O
-Ö O
-ò o
-ó o
-ô o
-õ o
-ö o
  Ō O
  ō o
  Ŏ O
  ŏ o
  Ő O
  ő o
-Œ E
-œ e
-Ø O
-ø o
  Ŕ R
  ŕ r
  Ŗ R
  ŗ r
  Ř R
  ř r
-ß S
  Ś S
  ś s
  Ŝ S
@@ -147,16 +138,6 @@
  ţ t
  Ť T
  ť t
-Ŧ T
-ŧ t
-Ù U
-Ú U
-Û U
-Ü U
-ù u
-ú u
-û u
-ü u
  Ũ U
  ũ u
  Ū U
@@ -171,9 +152,6 @@
  ų u
  Ŵ W
  ŵ w
-Ý Y
-ý y
-ÿ y
  Ŷ Y
  ŷ y
  Ÿ Y
@@ -183,5 +161,253 @@
  ż z
  Ž Z
  ž z
-ё е
+Ơ O
+ơ o
+Ư U
+ư u
+Ǆ DZ
+ǅ Dz
+ǆ dz
+Ǉ LJ
+ǈ Lj
+ǉ lj
+Ǌ NJ
+ǋ Nj
+ǌ nj
+Ǎ A
+ǎ a
+Ǐ I
+ǐ i
+Ǒ O
+ǒ o
+Ǔ U
+ǔ u
+Ǧ G
+ǧ g
+Ǩ K
+ǩ k
+Ǫ O
+ǫ o
+ǰ j
+Ǳ DZ
+ǲ Dz
+ǳ dz
+Ǵ G
+ǵ g
+Ǹ N
+ǹ n
+Ȁ A
+ȁ a
+Ȃ A
+ȃ a
+Ȅ E
+ȅ e
+Ȇ E
+ȇ e
+Ȉ I
+ȉ i
+Ȋ I
+ȋ i
+Ȍ O
+ȍ o
+Ȏ O
+ȏ o
+Ȑ R
+ȑ r
+Ȓ R
+ȓ r
+Ȕ U
+ȕ u
+Ȗ U
+ȗ u
+Ș S
+ș s
+Ț T
+ț t
+Ȟ H
+ȟ h
+Ȧ A
+ȧ a
+Ȩ E
+ȩ e
+Ȯ O
+ȯ o
+Ȳ Y
+ȳ y
+Ḁ    A
+ḁ    a
+Ḃ    B
+ḃ    b
+Ḅ    B
+ḅ    b
+Ḇ    B
+ḇ    b
+Ḋ    D
+ḋ    d
+Ḍ    D
+ḍ    d
+Ḏ    D
+ḏ    d
+Ḑ    D
+ḑ    d
+Ḓ    D
+ḓ    d
+Ḙ    E
+ḙ    e
+Ḛ    E
+ḛ    e
+Ḟ    F
+ḟ    f
+Ḡ    G
+ḡ    g
+Ḣ    H
+ḣ    h
+Ḥ    H
+ḥ    h
+Ḧ    H
+ḧ    h
+Ḩ    H
+ḩ    h
+Ḫ    H
+ḫ    h
+Ḭ    I
+ḭ    i
+Ḱ    K
+ḱ    k
+Ḳ    K
+ḳ    k
+Ḵ    K
+ḵ    k
+Ḷ    L
+ḷ    l
+Ḻ    L
+ḻ    l
+Ḽ    L
+ḽ    l
+Ḿ    M
+ḿ    m
+Ṁ    M
+ṁ    m
+Ṃ    M
+ṃ    m
+Ṅ    N
+ṅ    n
+Ṇ    N
+ṇ    n
+Ṉ    N
+ṉ    n
+Ṋ    N
+ṋ    n
+Ṕ    P
+ṕ    p
+Ṗ    P
+ṗ    p
+Ṙ    R
+ṙ    r
+Ṛ    R
+ṛ    r
+Ṟ    R
+ṟ    r
+Ṡ    S
+ṡ    s
+Ṣ    S
+ṣ    s
+Ṫ    T
+ṫ    t
+Ṭ    T
+ṭ    t
+Ṯ    T
+ṯ    t
+Ṱ    T
+ṱ    t
+Ṳ    U
+ṳ    u
+Ṵ    U
+ṵ    u
+Ṷ    U
+ṷ    u
+Ṽ    V
+ṽ    v
+Ṿ    V
+ṿ    v
+Ẁ    W
+ẁ    w
+Ẃ    W
+ẃ    w
+Ẅ    W
+ẅ    w
+Ẇ    W
+ẇ    w
+Ẉ    W
+ẉ    w
+Ẋ    X
+ẋ    x
+Ẍ    X
+ẍ    x
+Ẏ    Y
+ẏ    y
+Ẑ    Z
+ẑ    z
+Ẓ    Z
+ẓ    z
+Ẕ    Z
+ẕ    z
+ẖ    h
+ẗ    t
+ẘ    w
+ẙ    y
+Ạ    A
+ạ    a
+Ả    A
+ả    a
+Ẹ    E
+ẹ    e
+Ẻ    E
+ẻ    e
+Ẽ    E
+ẽ    e
+Ỉ    I
+ỉ    i
+Ị    I
+ị    i
+Ọ    O
+ọ    o
+Ỏ    O
+ỏ    o
+Ụ    U
+ụ    u
+Ủ    U
+ủ    u
+Ỳ    Y
+ỳ    y
+Ỵ    Y
+ỵ    y
+Ỷ    Y
+ỷ    y
+Ỹ    Y
+ỹ    y
+ﬀ    ff
+ﬁ    fi
+ﬂ    fl
+ﬃ    ffi
+ﬄ    ffl
+ﬆ    st
+Ø O
+ø o
+Đ D
+đ d
+ı i
+Ħ H
+ħ h
+Ł L
+ł l
+ŉ 'n
+Ŧ T
+ŧ t
  Ё Е
+ё е
+Æ AE
+ß ss
+æ ae
+Œ OE
+œ oe
author	Teodor Sigaev <teodor@sigaev.ru>
	Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)
committer	Teodor Sigaev <teodor@sigaev.ru>
	Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)
contrib/unaccent/generate_unaccent_rules.py	[new file with mode: 0644]	patch \| blob
contrib/unaccent/unaccent.rules		patch \| blob \| blame \| history