Add Greek characters to unaccent.rules.

author Thomas Munro <tmunro@postgresql.org>

Sat, 1 Sep 2018 19:12:24 +0000 (07:12 +1200)

committer Thomas Munro <tmunro@postgresql.org>

Sat, 1 Sep 2018 19:12:24 +0000 (07:12 +1200)
author Thomas Munro <tmunro@postgresql.org>
Sat, 1 Sep 2018 19:12:24 +0000 (07:12 +1200)
committer Thomas Munro <tmunro@postgresql.org>
Sat, 1 Sep 2018 19:12:24 +0000 (07:12 +1200)
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py

index 4b1b011861f1daeedefaa61cddb87656598b312a..859cac40fa1cdd466061ac1dbd5e5fc698f7c19a 100644 (file)
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -29,6 +29,15 @@ import argparse
  import sys
  import xml.etree.ElementTree as ET
  
+# The ranges of Unicode characters that we consider to be "plain letters".
+# For now we are being conservative by including only Latin and Greek.  This
+# could be extended in future based on feedback from people with relevant
+# language knowledge.
+PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
+                       (ord('A'), ord('Z')), # Latin upper case
+                       (0x03b1, 0x03c9),     # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
+                       (0x0391, 0x03a9))     # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
+
  def print_record(codepoint, letter):
      print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
  
@@ -39,9 +48,11 @@ class Codepoint:
          self.combining_ids = combining_ids
  
  def is_plain_letter(codepoint):
-    """Return true if codepoint represents a plain ASCII letter."""
-    return (codepoint.id >= ord('a') and codepoint.id <= ord('z')) or \
-           (codepoint.id >= ord('A') and codepoint.id <= ord('Z'))
+    """Return true if codepoint represents a "plain letter"."""
+    for begin, end in PLAIN_LETTER_RANGES:
+      if codepoint.id >= begin and codepoint.id <= end:
+        return True
+    return False
  
  def is_mark(codepoint):
      """Returns true for diacritical marks (combining codepoints)."""
@@ -184,7 +195,7 @@ def main(args):
             len(codepoint.combining_ids) > 1:
              if is_letter_with_marks(codepoint, table):
                  charactersSet.add((codepoint.id,
-                             chr(get_plain_letter(codepoint, table).id)))
+                             unichr(get_plain_letter(codepoint, table).id)))
              elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
                  charactersSet.add((codepoint.id,
                               "".join(unichr(combining_codepoint.id)
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules

index 97f9ed47cfa8585b470b1b586bc7736545456f4a..76e4e69bebb8c8d4c053a81a1114a773275804a5 100644 (file)
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -399,6 +399,26 @@
  ʦ ts
  ʪ ls
  ʫ lz
+Ά Α
+Έ Ε
+Ή Η
+Ί Ι
+Ό Ο
+Ύ Υ
+Ώ Ω
+ΐ ι
+Ϊ Ι
+Ϋ Υ
+ά α
+έ ε
+ή η
+ί ι
+ΰ υ
+ϊ ι
+ϋ υ
+ό ο
+ύ υ
+ώ ω
  Ё Е
  ё е
  ᴀ    A
@@ -709,6 +729,207 @@
  ỽ    v
  Ỿ    Y
  ỿ    y
+ἀ    α
+ἁ    α
+ἂ    α
+ἃ    α
+ἄ    α
+ἅ    α
+ἆ    α
+ἇ    α
+Ἀ    Α
+Ἁ    Α
+Ἂ    Α
+Ἃ    Α
+Ἄ    Α
+Ἅ    Α
+Ἆ    Α
+Ἇ    Α
+ἐ    ε
+ἑ    ε
+ἒ    ε
+ἓ    ε
+ἔ    ε
+ἕ    ε
+Ἐ    Ε
+Ἑ    Ε
+Ἒ    Ε
+Ἓ    Ε
+Ἔ    Ε
+Ἕ    Ε
+ἠ    η
+ἡ    η
+ἢ    η
+ἣ    η
+ἤ    η
+ἥ    η
+ἦ    η
+ἧ    η
+Ἠ    Η
+Ἡ    Η
+Ἢ    Η
+Ἣ    Η
+Ἤ    Η
+Ἥ    Η
+Ἦ    Η
+Ἧ    Η
+ἰ    ι
+ἱ    ι
+ἲ    ι
+ἳ    ι
+ἴ    ι
+ἵ    ι
+ἶ    ι
+ἷ    ι
+Ἰ    Ι
+Ἱ    Ι
+Ἲ    Ι
+Ἳ    Ι
+Ἴ    Ι
+Ἵ    Ι
+Ἶ    Ι
+Ἷ    Ι
+ὀ    ο
+ὁ    ο
+ὂ    ο
+ὃ    ο
+ὄ    ο
+ὅ    ο
+Ὀ    Ο
+Ὁ    Ο
+Ὂ    Ο
+Ὃ    Ο
+Ὄ    Ο
+Ὅ    Ο
+ὐ    υ
+ὑ    υ
+ὒ    υ
+ὓ    υ
+ὔ    υ
+ὕ    υ
+ὖ    υ
+ὗ    υ
+Ὑ    Υ
+Ὓ    Υ
+Ὕ    Υ
+Ὗ    Υ
+ὠ    ω
+ὡ    ω
+ὢ    ω
+ὣ    ω
+ὤ    ω
+ὥ    ω
+ὦ    ω
+ὧ    ω
+Ὠ    Ω
+Ὡ    Ω
+Ὢ    Ω
+Ὣ    Ω
+Ὤ    Ω
+Ὥ    Ω
+Ὦ    Ω
+Ὧ    Ω
+ὰ    α
+ὲ    ε
+ὴ    η
+ὶ    ι
+ὸ    ο
+ὺ    υ
+ὼ    ω
+ᾀ    α
+ᾁ    α
+ᾂ    α
+ᾃ    α
+ᾄ    α
+ᾅ    α
+ᾆ    α
+ᾇ    α
+ᾈ    Α
+ᾉ    Α
+ᾊ    Α
+ᾋ    Α
+ᾌ    Α
+ᾍ    Α
+ᾎ    Α
+ᾏ    Α
+ᾐ    η
+ᾑ    η
+ᾒ    η
+ᾓ    η
+ᾔ    η
+ᾕ    η
+ᾖ    η
+ᾗ    η
+ᾘ    Η
+ᾙ    Η
+ᾚ    Η
+ᾛ    Η
+ᾜ    Η
+ᾝ    Η
+ᾞ    Η
+ᾟ    Η
+ᾠ    ω
+ᾡ    ω
+ᾢ    ω
+ᾣ    ω
+ᾤ    ω
+ᾥ    ω
+ᾦ    ω
+ᾧ    ω
+ᾨ    Ω
+ᾩ    Ω
+ᾪ    Ω
+ᾫ    Ω
+ᾬ    Ω
+ᾭ    Ω
+ᾮ    Ω
+ᾯ    Ω
+ᾰ    α
+ᾱ    α
+ᾲ    α
+ᾳ    α
+ᾴ    α
+ᾶ    α
+ᾷ    α
+Ᾰ    Α
+Ᾱ    Α
+Ὰ    Α
+ᾼ    Α
+ῂ    η
+ῃ    η
+ῄ    η
+ῆ    η
+ῇ    η
+Ὲ    Ε
+Ὴ    Η
+ῌ    Η
+ῐ    ι
+ῑ    ι
+ῒ    ι
+ῖ    ι
+ῗ    ι
+Ῐ    Ι
+Ῑ    Ι
+Ὶ    Ι
+ῠ    υ
+ῡ    υ
+ῢ    υ
+ῤ    ρ
+ῥ    ρ
+ῦ    υ
+ῧ    υ
+Ῠ    Υ
+Ῡ    Υ
+Ὺ    Υ
+Ῥ    Ρ
+ῲ    ω
+ῳ    ω
+ῴ    ω
+ῶ    ω
+ῷ    ω
+Ὸ    Ο
+Ὼ    Ω
+ῼ    Ω
  ‐    -
  ‑    -
  ‒    -
author	Thomas Munro <tmunro@postgresql.org>
	Sat, 1 Sep 2018 19:12:24 +0000 (07:12 +1200)
committer	Thomas Munro <tmunro@postgresql.org>
	Sat, 1 Sep 2018 19:12:24 +0000 (07:12 +1200)
contrib/unaccent/generate_unaccent_rules.py		patch | blob | blame | history
contrib/unaccent/unaccent.rules		patch | blob | blame | history