unaccent: Make generate_unaccent_rules.py Python 3 compatible

author Peter Eisentraut <peter@eisentraut.org>
Fri, 4 Jan 2019 10:12:31 +0000 (11:12 +0100)

committer Peter Eisentraut <peter@eisentraut.org>
Fri, 4 Jan 2019 10:12:31 +0000 (11:12 +0100)
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py

index 859cac40fa1cdd466061ac1dbd5e5fc698f7c19a..c9aef490aef1feeb15b75786ae3db62ded3068b0 100644 (file)
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python2
+#!/usr/bin/python
  # -*- coding: utf-8 -*-
  #
  # This script builds unaccent.rules on standard output when given the
@@ -23,6 +23,24 @@
  # [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
  # [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
  
+# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
+# The approach is to write Python 3 code and "backport" the names it needs to Python 2.
+from __future__ import print_function
+from __future__ import unicode_literals
+import codecs
+import sys
+
+if sys.version_info[0] <= 2:
+    # Encode stdout as UTF-8, so we can just print to it
+    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
+
+    # Map Python 2's chr to unichr
+    chr = unichr
+
+    # Python 2 and 3 compatible bytes call
+    def bytes(source, encoding='ascii', errors='strict'):
+        return source.encode(encoding=encoding, errors=errors)
+# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
  
  import re
  import argparse
@@ -39,7 +57,7 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
                         (0x0391, 0x03a9))     # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
  
  def print_record(codepoint, letter):
-    print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
+    print (chr(codepoint) + "\t" + letter)
  
  class Codepoint:
      def __init__(self, id, general_category, combining_ids):
@@ -116,7 +134,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
      charactersSet = set()
  
      # RegEx to parse rules
-    rulePattern = re.compile(ur'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
+    rulePattern = re.compile(r'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
  
      # construct tree from XML
      transliterationTree = ET.parse(latinAsciiFilePath)
@@ -134,7 +152,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
          # Group 3: plain "trg" char. Empty if group 4 is not.
          # Group 4: plain "trg" char between quotes. Empty if group 3 is not.
          if matches is not None:
-            src = matches.group(1) if matches.group(1) is not None else matches.group(2).decode('unicode-escape')
+            src = matches.group(1) if matches.group(1) is not None else bytes(matches.group(2), 'UTF-8').decode('unicode-escape')
              trg = matches.group(3) if matches.group(3) is not None else matches.group(4)
  
              # "'" and """ are escaped
@@ -195,10 +213,10 @@ def main(args):
             len(codepoint.combining_ids) > 1:
              if is_letter_with_marks(codepoint, table):
                  charactersSet.add((codepoint.id,
-                             unichr(get_plain_letter(codepoint, table).id)))
+                             chr(get_plain_letter(codepoint, table).id)))
              elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
                  charactersSet.add((codepoint.id,
-                             "".join(unichr(combining_codepoint.id)
+                             "".join(chr(combining_codepoint.id)
                                       for combining_codepoint \
                                       in get_plain_letters(codepoint, table))))
author	Peter Eisentraut <peter@eisentraut.org>
	Fri, 4 Jan 2019 10:12:31 +0000 (11:12 +0100)
committer	Peter Eisentraut <peter@eisentraut.org>
	Fri, 4 Jan 2019 10:12:31 +0000 (11:12 +0100)