Make unaccent handle all diacritics known to Unicode, and expand ligatures correctly
author Teodor Sigaev <teodor@sigaev.ru>
Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)
committer Teodor Sigaev <teodor@sigaev.ru>
Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)
Add Python script for building unaccent.rules from Unicode data. Don't
backpatch because unaccent changes may require a tsvector/index
rebuild.

Thomas Munro <thomas.munro@enterprisedb.com>

contrib/unaccent/generate_unaccent_rules.py [new file with mode: 0644]
contrib/unaccent/unaccent.rules

diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
new file mode 100644 (file)
index 0000000..b838d8f
--- /dev/null
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+#
+# This script builds unaccent.rules on standard output when given the
+# contents of UnicodeData.txt[1] on standard input.  Optionally includes
+# ligature expansion, if --expand-ligatures is given on the command line.
+#
+# The approach is to use the Unicode decomposition data to identify
+# precomposed codepoints that are equivalent to a ligature of several
+# letters, or a base letter with any number of diacritical marks.
+# There is also a small set of special cases for codepoints that we
+# traditionally support even though Unicode doesn't consider them to
+# be ligatures or letters with marks.
+#
+# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
+
+import re
+import sys
+
+def print_record(codepoint, letter):
+    print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
+
+class Codepoint:
+    def __init__(self, id, general_category, combining_ids):
+        self.id = id
+        self.general_category = general_category
+        self.combining_ids = combining_ids
+
+def is_plain_letter(codepoint):
+    """Return true if codepoint represents a plain ASCII letter."""
+    return (codepoint.id >= ord('a') and codepoint.id <= ord('z')) or \
+           (codepoint.id >= ord('A') and codepoint.id <= ord('Z'))
+
+def is_mark(codepoint):
+    """Returns true for diacritical marks (combining codepoints)."""
+    return codepoint.general_category in ("Mn", "Me", "Mc")
+
+def is_letter_with_marks(codepoint, table):
+    """Returns true for plain letters combined with one or more marks."""
+    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
+    return len(codepoint.combining_ids) > 1 and \
+           is_plain_letter(table[codepoint.combining_ids[0]]) and \
+           all(is_mark(table[i]) for i in codepoint.combining_ids[1:])
+
+def is_letter(codepoint, table):
+    """Return true for letter with or without diacritical marks."""
+    return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
+
+def get_plain_letter(codepoint, table):
+    """Return the base codepoint without marks."""
+    if is_letter_with_marks(codepoint, table):
+        return table[codepoint.combining_ids[0]]
+    elif is_plain_letter(codepoint):
+        return codepoint
+    else:
+        raise "mu"
+
+def is_ligature(codepoint, table):
+    """Return true for letters combined with letters."""
+    return all(is_letter(table[i], table) for i in codepoint.combining_ids)
+
+def get_plain_letters(codepoint, table):
+    """Return a list of plain letters from a ligature."""
+    assert(is_ligature(codepoint, table))
+    return [get_plain_letter(table[id], table) for id in codepoint.combining_ids]
+
+def main(expand_ligatures):
+    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
+    decomposition_type_pattern = re.compile(" *<[^>]*> *")
+
+    table = {}
+    all = []
+
+    # read everything we need into memory
+    for line in sys.stdin.readlines():
+        fields = line.split(";")
+        if len(fields) > 5:
+            # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
+            general_category = fields[2]
+            decomposition = fields[5]
+            decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
+            id = int(fields[0], 16)
+            combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
+            codepoint = Codepoint(id, general_category, combining_ids)
+            table[id] = codepoint
+            all.append(codepoint)
+
+    # walk through all the codepoints looking for interesting mappings
+    for codepoint in all:
+        if codepoint.general_category.startswith('L') and \
+           len(codepoint.combining_ids) > 1:
+            if is_letter_with_marks(codepoint, table):
+                print_record(codepoint.id,
+                             chr(get_plain_letter(codepoint, table).id))
+            elif expand_ligatures and is_ligature(codepoint, table):
+                print_record(codepoint.id,
+                             "".join(unichr(combining_codepoint.id)
+                                     for combining_codepoint \
+                                     in get_plain_letters(codepoint, table)))
+
+    # some special cases
+    print_record(0x00d8, "O") # LATIN CAPITAL LETTER O WITH STROKE
+    print_record(0x00f8, "o") # LATIN SMALL LETTER O WITH STROKE
+    print_record(0x0110, "D") # LATIN CAPITAL LETTER D WITH STROKE
+    print_record(0x0111, "d") # LATIN SMALL LETTER D WITH STROKE
+    print_record(0x0131, "i") # LATIN SMALL LETTER DOTLESS I
+    print_record(0x0126, "H") # LATIN CAPITAL LETTER H WITH STROKE
+    print_record(0x0127, "h") # LATIN SMALL LETTER H WITH STROKE
+    print_record(0x0141, "L") # LATIN CAPITAL LETTER L WITH STROKE
+    print_record(0x0142, "l") # LATIN SMALL LETTER L WITH STROKE
+    print_record(0x0149, "'n") # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+    print_record(0x0166, "T") # LATIN CAPITAL LETTER T WITH STROKE
+    print_record(0x0167, "t") # LATIN SMALL LETTER T WITH STROKE
+    print_record(0x0401, u"\u0415") # CYRILLIC CAPITAL LETTER IO
+    print_record(0x0451, u"\u0435") # CYRILLIC SMALL LETTER IO
+    if expand_ligatures:
+        print_record(0x00c6, "AE") # LATIN CAPITAL LETTER AE
+        print_record(0x00df, "ss") # LATIN SMALL LETTER SHARP S
+        print_record(0x00e6, "ae") # LATIN SMALL LETTER AE
+        print_record(0x0152, "OE") # LATIN CAPITAL LIGATURE OE
+        print_record(0x0153, "oe") # LATIN SMALL LIGATURE OE
+
+if __name__ == "__main__":
+    main(len(sys.argv) == 2 and sys.argv[1] == "--expand-ligatures")
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
index cc2f7a65858d7e265850cea4592528c3866ca698..73c24a188badf9dfcbf5ab3950841232014e3c22 100644 (file)
@@ -4,22 +4,59 @@
 Ã A
 Ä A
 Å A
-Æ A
+Ç C
+È E
+É E
+Ê E
+Ë E
+Ì I
+Í I
+Î I
+Ï I
+Ñ N
+Ò O
+Ó O
+Ô O
+Õ O
+Ö O
+Ù U
+Ú U
+Û U
+Ü U
+Ý Y
 à a
 á a
 â a
 ã a
 ä a
 å a
-æ a
+ç c
+è e
+é e
+ê e
+ë e
+ì i
+í i
+î i
+ï i
+ñ n
+ò o
+ó o
+ô o
+õ o
+ö o
+ù u
+ú u
+û u
+ü u
+ý y
+ÿ y
 Ā A
 ā a
 Ă A
 ă a
 Ą A
 ą a
-Ç C
-ç c
 Ć C
 ć c
 Ĉ C
 č c
 Ď D
 ď d
-Đ D
-đ d
-È E
-É E
-Ê E
-Ë E
-è e
-é e
-ê e
-ë e
 Ē E
 ē e
 Ĕ E
 ģ g
 Ĥ H
 ĥ h
-Ħ H
-ħ h
 Ĩ I
-Ì I
-Í I
-Î I
-Ï I
-ì i
-í i
-î i
-ï i
 ĩ i
 Ī I
 ī i
 Į I
 į i
 İ I
-ı i
-IJ I
-ij i
+IJ IJ
+ij ij
 Ĵ J
 ĵ j
 Ķ K
 ķ k
-ĸ k
 Ĺ L
 ĺ l
 Ļ L
 ļ l
 Ľ L
 ľ l
-Ŀ L
-ŀ l
-Ł L
-ł l
-Ñ N
-ñ n
 Ń N
 ń n
 Ņ N
 ņ n
 Ň N
 ň n
-ʼn n
-Ŋ N
-ŋ n
-Ò O
-Ó O
-Ô O
-Õ O
-Ö O
-ò o
-ó o
-ô o
-õ o
-ö o
 Ō O
 ō o
 Ŏ O
 ŏ o
 Ő O
 ő o
-Œ E
-œ e
-Ø O
-ø o
 Ŕ R
 ŕ r
 Ŗ R
 ŗ r
 Ř R
 ř r
-ß S
 Ś S
 ś s
 Ŝ S
 ţ t
 Ť T
 ť t
-Ŧ T
-ŧ t
-Ù U
-Ú U
-Û U
-Ü U
-ù u
-ú u
-û u
-ü u
 Ũ U
 ũ u
 Ū U
 ų u
 Ŵ W
 ŵ w
-Ý Y
-ý y
-ÿ y
 Ŷ Y
 ŷ y
 Ÿ Y
 ż z
 Ž Z
 ž z
-ё е
+Ơ O
+ơ o
+Ư U
+ư u
+DŽ DZ
+Dž Dz
+dž dz
+LJ LJ
+Lj Lj
+lj lj
+NJ NJ
+Nj Nj
+nj nj
+Ǎ A
+ǎ a
+Ǐ I
+ǐ i
+Ǒ O
+ǒ o
+Ǔ U
+ǔ u
+Ǧ G
+ǧ g
+Ǩ K
+ǩ k
+Ǫ O
+ǫ o
+ǰ j
+DZ DZ
+Dz Dz
+dz dz
+Ǵ G
+ǵ g
+Ǹ N
+ǹ n
+Ȁ A
+ȁ a
+Ȃ A
+ȃ a
+Ȅ E
+ȅ e
+Ȇ E
+ȇ e
+Ȉ I
+ȉ i
+Ȋ I
+ȋ i
+Ȍ O
+ȍ o
+Ȏ O
+ȏ o
+Ȑ R
+ȑ r
+Ȓ R
+ȓ r
+Ȕ U
+ȕ u
+Ȗ U
+ȗ u
+Ș S
+ș s
+Ț T
+ț t
+Ȟ H
+ȟ h
+Ȧ A
+ȧ a
+Ȩ E
+ȩ e
+Ȯ O
+ȯ o
+Ȳ Y
+ȳ y
+Ḁ    A
+ḁ    a
+Ḃ    B
+ḃ    b
+Ḅ    B
+ḅ    b
+Ḇ    B
+ḇ    b
+Ḋ    D
+ḋ    d
+Ḍ    D
+ḍ    d
+Ḏ    D
+ḏ    d
+Ḑ    D
+ḑ    d
+Ḓ    D
+ḓ    d
+Ḙ    E
+ḙ    e
+Ḛ    E
+ḛ    e
+Ḟ    F
+ḟ    f
+Ḡ    G
+ḡ    g
+Ḣ    H
+ḣ    h
+Ḥ    H
+ḥ    h
+Ḧ    H
+ḧ    h
+Ḩ    H
+ḩ    h
+Ḫ    H
+ḫ    h
+Ḭ    I
+ḭ    i
+Ḱ    K
+ḱ    k
+Ḳ    K
+ḳ    k
+Ḵ    K
+ḵ    k
+Ḷ    L
+ḷ    l
+Ḻ    L
+ḻ    l
+Ḽ    L
+ḽ    l
+Ḿ    M
+ḿ    m
+Ṁ    M
+ṁ    m
+Ṃ    M
+ṃ    m
+Ṅ    N
+ṅ    n
+Ṇ    N
+ṇ    n
+Ṉ    N
+ṉ    n
+Ṋ    N
+ṋ    n
+Ṕ    P
+ṕ    p
+Ṗ    P
+ṗ    p
+Ṙ    R
+ṙ    r
+Ṛ    R
+ṛ    r
+Ṟ    R
+ṟ    r
+Ṡ    S
+ṡ    s
+Ṣ    S
+ṣ    s
+Ṫ    T
+ṫ    t
+Ṭ    T
+ṭ    t
+Ṯ    T
+ṯ    t
+Ṱ    T
+ṱ    t
+Ṳ    U
+ṳ    u
+Ṵ    U
+ṵ    u
+Ṷ    U
+ṷ    u
+Ṽ    V
+ṽ    v
+Ṿ    V
+ṿ    v
+Ẁ    W
+ẁ    w
+Ẃ    W
+ẃ    w
+Ẅ    W
+ẅ    w
+Ẇ    W
+ẇ    w
+Ẉ    W
+ẉ    w
+Ẋ    X
+ẋ    x
+Ẍ    X
+ẍ    x
+Ẏ    Y
+ẏ    y
+Ẑ    Z
+ẑ    z
+Ẓ    Z
+ẓ    z
+Ẕ    Z
+ẕ    z
+ẖ    h
+ẗ    t
+ẘ    w
+ẙ    y
+Ạ    A
+ạ    a
+Ả    A
+ả    a
+Ẹ    E
+ẹ    e
+Ẻ    E
+ẻ    e
+Ẽ    E
+ẽ    e
+Ỉ    I
+ỉ    i
+Ị    I
+ị    i
+Ọ    O
+ọ    o
+Ỏ    O
+ỏ    o
+Ụ    U
+ụ    u
+Ủ    U
+ủ    u
+Ỳ    Y
+ỳ    y
+Ỵ    Y
+ỵ    y
+Ỷ    Y
+ỷ    y
+Ỹ    Y
+ỹ    y
+ff    ff
+fi    fi
+fl    fl
+ffi    ffi
+ffl    ffl
+st    st
+Ø O
+ø o
+Đ D
+đ d
+ı i
+Ħ H
+ħ h
+Ł L
+ł l
+ʼn 'n
+Ŧ T
+ŧ t
 Ё Е
+ё е
+Æ AE
+ß ss
+æ ae
+Œ OE
+œ oe