From 7e5e76b08e69db81ae703352f3e2070ff8966f44 Mon Sep 17 00:00:00 2001 From: pknowles Date: Mon, 15 May 2023 00:04:55 -0700 Subject: [PATCH 1/5] extras: adds special_attribute --- lib/markdown2.py | 59 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 79ffbc46..4b1b3a23 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -1418,6 +1418,17 @@ def _sanitize_html(self, s): (?P.*?) \] ''', re.X | re.S) + _special_attribute = re.compile(r''' + [ \t]* + ( + (?<=[ \t{]) # must-be-separated look-behind + \#[A-Za-z][-A-Za-z0-9_:.]* # html id + | + \.-?[_a-zA-Z]+[_a-zA-Z0-9-]* # css class + | + ([A-Za-z]+)=([A-Za-z0-9%.]+) # simple attribute + ) + ''', re.X | re.S) _whitespace = re.compile(r'\s*') @@ -1465,6 +1476,32 @@ def _extract_url_and_title(self, text, start): url = self._strip_anglebrackets.sub(r'\1', url) return url, title, end_idx + def _extract_special_attributes(self, text, start, allowlist=None): + """Extracts the url and (optional) title from the tail of a link""" + # text[start] equals the opening parenthesis + idx = self._find_non_whitespace(text, start+1) + if idx == len(text): + return {}, start + end_idx = idx + end_idx = self._find_balanced(text, end_idx, "{", "}") + result = {} + classes = [] + for match in self._special_attribute.finditer(text, idx, end_idx): + if match.group(1)[0] == "#": + if allowlist is None or match.group(1) in allowlist: + result["id"] = match.group(1)[0][1:] + elif match.group(1)[0] == ".": + if allowlist is None or match.group(1) in allowlist: + classes += [match.group(1)[0][1:]] + elif match.group(2): + attribute = match.group(2) + value = match.group(3) + if allowlist is None or attribute in allowlist: + result[attribute] = _xml_escape_attr(value) + if len(classes): + result["class"] = " ".join(classes) + return result, end_idx + def _protect_url(self, url): ''' Function that passes a URL through `_html_escape_url` to remove any nasty characters, @@ -1568,6 +1605,10 @@ def _do_links(self, text): is_img = start_idx > 0 and text[start_idx-1] == "!" if is_img: start_idx -= 1 + attributes = {} + if 'special-attributes' in self.extras: + allowlist = self.extras['special-attributes'].get('img') if isinstance(self.extras, dict) else None + attributes, url_end_idx = self._extract_special_attributes(text, url_end_idx, allowlist) # We've got to encode these to avoid conflicting # with italics/bold. @@ -1582,11 +1623,12 @@ def _do_links(self, text): title_str = '' if is_img: img_class_str = self._html_class_str_from_tag("img") - result = '%s 0 and text[start_idx-1] == "!" if is_img: start_idx -= 1 + attributes = {} + if 'special-attributes' in self.extras: + allowlist = self.extras['special-attributes'].get('img') if isinstance(self.extras, dict) else None + attributes, consume_end = self._extract_special_attributes(text, consume_end, allowlist) link_id = match.group("id").lower() if not link_id: link_id = link_text.lower() # for links like [this][] @@ -1639,16 +1687,17 @@ def _do_links(self, text): title_str = '' if is_img: img_class_str = self._html_class_str_from_tag("img") - result = '%s= anchor_allowed_pos: if self.safe_mode and not self._safe_protocols.match(url): result_head = '' % (title_str) @@ -1661,13 +1710,13 @@ def _do_links(self, text): # anchor_allowed_pos on. curr_pos = start_idx + len(result_head) anchor_allowed_pos = start_idx + len(result) - text = text[:start_idx] + result + text[match.end():] + text = text[:start_idx] + result + text[consume_end:] else: # Anchor not allowed here. curr_pos = start_idx + 1 else: # This id isn't defined, leave the markup alone. - curr_pos = match.end() + curr_pos = consume_end continue # Otherwise, it isn't markup. From 1039949e654afd73fcc9e7359aa4751d6368b4d6 Mon Sep 17 00:00:00 2001 From: pknowles Date: Mon, 22 May 2023 12:28:53 -0700 Subject: [PATCH 2/5] python2 and unicode - isinstance() checks for both str and unicode objects - yield from to for loop - use .decode instead of str constructor --- lib/markdown2.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 4b1b3a23..a9ea4654 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -305,7 +305,7 @@ def convert(self, text): if not isinstance(text, str): # TODO: perhaps shouldn't presume UTF-8 for string input? - text = str(text, 'utf-8') + text = text.decode('utf-8') if self.use_file_vars: # Look for emacs-style file variable hints. @@ -741,7 +741,7 @@ def _detab(self, text): _html_markdown_attr_re = re.compile( r'''\s+markdown=("1"|'1')''') def _hash_html_block_sub(self, match, raw=False): - if isinstance(match, str): + if isinstance(match, (str, unicode)): html = match else: html = match.group(1) @@ -1740,7 +1740,7 @@ def header_id_from_text(self, text, prefix, n): the TOC (if the "toc" extra is specified). """ header_id = _slugify(text) - if prefix and isinstance(prefix, str): + if prefix and isinstance(prefix, (str, unicode)): header_id = prefix + '-' + header_id self._count_from_header_id[header_id] += 1 @@ -2001,7 +2001,8 @@ def _wrap_code(self, inner): def _add_newline(self, inner): # Add newlines around the inner contents so that _strict_tag_block_re matches the outer div. yield 0, "\n" - yield from inner + for i in inner: + yield i yield 0, "\n" def wrap(self, source, outfile=None): From 76c663dbb2b93edc9b343d6ff4e429d327aaf269 Mon Sep 17 00:00:00 2001 From: pknowles Date: Mon, 22 May 2023 14:03:08 -0700 Subject: [PATCH 3/5] add a more general url parser/sanitizer copies a url regex from pagedown (used by stackoverflow) --- lib/markdown2.py | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index a9ea4654..4d60e64a 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -1513,7 +1513,43 @@ def _protect_url(self, url): self._escape_table[url] = key return key - _safe_protocols = re.compile(r'(https?|ftp):', re.I) + # _safe_href is copied from pagedown's Markdown.Sanitizer.js + # Inlining the entire license as I don't have the time to add it properly for upstreaming + # From: https://github.com/StackExchange/pagedown/blob/master/LICENSE.txt + # + # A javascript port of Markdown, as used on Stack Overflow + # and the rest of Stack Exchange network. + # + # Largely based on showdown.js by John Fraser (Attacklab). + # + # Original Markdown Copyright (c) 2004-2005 John Gruber + # + # + # + # Original Showdown code copyright (c) 2007 John Fraser + # + # Modifications and bugfixes (c) 2009 Dana Robinson + # Modifications and bugfixes (c) 2009-2014 Stack Exchange Inc. + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to deal + # in the Software without restriction, including without limitation the rights + # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + # copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in + # all copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + # THE SOFTWARE. + _safe_href = re.compile(r'^((https?|ftp):\/\/|\/|\.|#)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)*[\]$]+$', re.I) + def _do_links(self, text): """Turn Markdown link shortcuts into XHTML and tags. @@ -1636,7 +1672,7 @@ def _do_links(self, text): anchor_allowed_pos = start_idx + len(result) text = text[:start_idx] + result + text[url_end_idx:] elif start_idx >= anchor_allowed_pos: - safe_link = self._safe_protocols.match(url) or url.startswith('#') + safe_link = self._safe_href.match(url) if self.safe_mode and not safe_link: result_head = '' % (title_str) else: @@ -1699,7 +1735,7 @@ def _do_links(self, text): curr_pos = start_idx + len(result) text = text[:start_idx] + result + text[consume_end:] elif start_idx >= anchor_allowed_pos: - if self.safe_mode and not self._safe_protocols.match(url): + if self.safe_mode and not self._safe_href.match(url): result_head = '' % (title_str) else: result_head = '' % (self._protect_url(url), title_str) From ad4ee4e54ceb03a8223213b31990c7104823e1f1 Mon Sep 17 00:00:00 2001 From: pknowles Date: Mon, 22 May 2023 15:44:16 -0700 Subject: [PATCH 4/5] Reverts "Removed Python2 support." This reverts commit 77d5275adc832b8952600d780ff8a5ad6d028c99. --- CONTRIBUTORS.txt | 1 - TODO.txt | 1 + lib/markdown2.py | 36 +++++++++++++++++++++----- perf/gen_perf_cases.py | 22 ++++++++-------- perf/perf.py | 16 ++++++------ perf/util.py | 4 +-- sandbox/wiki.py | 2 +- test/markdown.py | 32 +++++++++++------------ test/test_markdown2.py | 47 ++++++++++++++++++++++++++++------ test/testall.py | 2 +- tools/tables-align-columns.py | 12 ++++----- tools/which.py | 10 ++++---- tools/wiki-tables-to-tables.py | 2 +- 13 files changed, 121 insertions(+), 66 deletions(-) diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index 5e0980ab..09c1cb86 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -52,7 +52,6 @@ Maximilian Hils (github.com/mhils) BarkeH (github.com/BarkeH) cav71 (github.com/cav71) Crozzers (github.com/Crozzers) -Bastian Venthur (https://github.com/venthur), removed Python2 support gitbra (github.com/gitbra) Ɓukasz Langa (github.com/ambv) Max Omdal (github.com/momja) diff --git a/TODO.txt b/TODO.txt index 9f8cbb9d..e6f880b2 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,3 +1,4 @@ +- py3: py2.4 test (broken?) - add "smarty-pants" extra to wiki - add "html-classes" extra to wiki - more on the "code-color" extra wiki page diff --git a/lib/markdown2.py b/lib/markdown2.py index 4d60e64a..09838c9d 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -112,6 +112,22 @@ import codecs from collections import defaultdict + +# ---- Python version compat + +# Use `bytes` for byte strings and `unicode` for unicode strings (str in Py3). +if sys.version_info[0] <= 2: + py3 = False + try: + bytes + except NameError: + bytes = str + base_string_type = basestring +elif sys.version_info[0] >= 3: + py3 = True + unicode = str + base_string_type = str + # ---- globals DEBUG = False @@ -303,9 +319,9 @@ def convert(self, text): # articles): self.reset() - if not isinstance(text, str): + if not isinstance(text, unicode): # TODO: perhaps shouldn't presume UTF-8 for string input? - text = text.decode('utf-8') + text = unicode(text, 'utf-8') if self.use_file_vars: # Look for emacs-style file variable hints. @@ -1776,7 +1792,7 @@ def header_id_from_text(self, text, prefix, n): the TOC (if the "toc" extra is specified). """ header_id = _slugify(text) - if prefix and isinstance(prefix, (str, unicode)): + if prefix and isinstance(prefix, base_string_type): header_id = prefix + '-' + header_id self._count_from_header_id[header_id] += 1 @@ -2776,7 +2792,7 @@ def indent(): return '\n'.join(lines) + '\n' -class UnicodeWithAttrs(str): +class UnicodeWithAttrs(unicode): """A subclass of unicode used for the return value of conversion to possibly attach some attributes. E.g. the "toc_html" attribute when the "toc" extra is used. @@ -3151,7 +3167,11 @@ def main(argv=None): p.stdin.write(text.encode('utf-8')) p.stdin.close() perl_html = p.stdout.read().decode('utf-8') - sys.stdout.write(perl_html) + if py3: + sys.stdout.write(perl_html) + else: + sys.stdout.write(perl_html.encode( + sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')) print("==== markdown2.py ====") html = markdown(text, html4tags=opts.html4tags, @@ -3159,7 +3179,11 @@ def main(argv=None): extras=extras, link_patterns=link_patterns, use_file_vars=opts.use_file_vars, cli=True) - sys.stdout.write(html) + if py3: + sys.stdout.write(html) + else: + sys.stdout.write(html.encode( + sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')) if extras and "toc" in extras: log.debug("toc_html: " + str(html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))) diff --git a/perf/gen_perf_cases.py b/perf/gen_perf_cases.py index 28b9e18c..2108f2a5 100755 --- a/perf/gen_perf_cases.py +++ b/perf/gen_perf_cases.py @@ -15,7 +15,7 @@ def gen_aspn_cases(limit=0): base_dir = TMP+'aspn-cases' if exists(base_dir): - print("'%s' exists, skipping" % base_dir) + print "'%s' exists, skipping" % base_dir return os.makedirs(base_dir) sys.stdout.write("generate %s" % base_dir); sys.stdout.flush() @@ -48,10 +48,10 @@ def gen_aspn_cases(limit=0): def gen_test_cases(): base_dir = TMP+"test-cases" if exists(base_dir): - print("'%s' exists, skipping" % base_dir) + print "'%s' exists, skipping" % base_dir return os.makedirs(base_dir) - print("generate %s" % base_dir) + print "generate %s" % base_dir for test_cases_dir in glob(join("..", "test", "*-cases")): for text_file in glob(join(test_cases_dir, "*.text")): shutil.copy(text_file, join(base_dir, basename(text_file))) @@ -134,7 +134,7 @@ def _markdown_from_aspn_html(html): try: idx = markdown.index(marker) except ValueError: - print("marker: %r" % marker) + print "marker: %r" % marker raise if not markdown[:idx].strip(): #TODO: Correct this false diagnosis. Problem is not limited @@ -191,8 +191,8 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False): """ DEBUG = False if DEBUG: - print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\ - % (tabsize, skip_first_line)) + print "dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\ + % (tabsize, skip_first_line) indents = [] margin = None for i, line in enumerate(lines): @@ -209,12 +209,12 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False): break else: continue # skip all-whitespace lines - if DEBUG: print("dedent: indent=%d: %r" % (indent, line)) + if DEBUG: print "dedent: indent=%d: %r" % (indent, line) if margin is None: margin = indent else: margin = min(margin, indent) - if DEBUG: print("dedent: margin=%r" % margin) + if DEBUG: print "dedent: margin=%r" % margin if margin is not None and margin > 0: for i, line in enumerate(lines): @@ -226,7 +226,7 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False): elif ch == '\t': removed += tabsize - (removed % tabsize) elif ch in '\r\n': - if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line) + if DEBUG: print "dedent: %r: EOL -> strip up to EOL" % line lines[i] = lines[i][j:] break else: @@ -234,8 +234,8 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False): "line %r while removing %d-space margin" % (ch, line, margin)) if DEBUG: - print("dedent: %r: %r -> removed %d/%d"\ - % (line, ch, removed, margin)) + print "dedent: %r: %r -> removed %d/%d"\ + % (line, ch, removed, margin) if removed == margin: lines[i] = lines[i][j+1:] break diff --git a/perf/perf.py b/perf/perf.py index ad500d8e..b2b04a0a 100755 --- a/perf/perf.py +++ b/perf/perf.py @@ -44,7 +44,7 @@ def time_markdown_py(cases_dir, repeat): pass end = clock() times.append(end - start) - print(" markdown.py: best of %d: %.3fs" % (repeat, min(times))) + print " markdown.py: best of %d: %.3fs" % (repeat, min(times)) @hotshotit def hotshot_markdown2_py(cases_dir, repeat): @@ -65,7 +65,7 @@ def time_markdown2_py(cases_dir, repeat): markdowner.convert(content) end = clock() times.append(end - start) - print(" markdown2.py: best of %d: %.3fs" % (repeat, min(times))) + print " markdown2.py: best of %d: %.3fs" % (repeat, min(times)) def time_markdown_pl(cases_dir, repeat): times = [] @@ -74,7 +74,7 @@ def time_markdown_pl(cases_dir, repeat): os.system('perl time_markdown_pl.pl "%s"' % cases_dir) end = clock() times.append(end - start) - print(" Markdown.pl: best of %d: %.3fs" % (repeat, min(times))) + print " Markdown.pl: best of %d: %.3fs" % (repeat, min(times)) def time_all(cases_dir, repeat): time_markdown_pl(cases_dir, repeat=repeat) @@ -130,10 +130,10 @@ def main(args=sys.argv): if timer_name not in d: raise ValueError("no '%s' timer function" % timer_name) timer = d[timer_name] - print("Profile conversion of %s (plat=%s):" \ - % (os.path.join(cases_dir, "*.text"), sys.platform)) + print "Profile conversion of %s (plat=%s):" \ + % (os.path.join(cases_dir, "*.text"), sys.platform) timer(cases_dir, repeat=opts.repeat) - print() + print os.system("python show_stats.py %s.prof" % timer_name) else: @@ -144,8 +144,8 @@ def main(args=sys.argv): if timer_name not in d: raise ValueError("no '%s' timer function" % timer_name) timer = d[timer_name] - print("Time conversion of %s (plat=%s):" \ - % (os.path.join(cases_dir, "*.text"), sys.platform)) + print "Time conversion of %s (plat=%s):" \ + % (os.path.join(cases_dir, "*.text"), sys.platform) timer(cases_dir, repeat=opts.repeat) if __name__ == "__main__": diff --git a/perf/util.py b/perf/util.py index e32d0f8b..4b52e6f8 100644 --- a/perf/util.py +++ b/perf/util.py @@ -30,14 +30,14 @@ def wrapper(*args, **kw): return func(*args, **kw) finally: total_time = clock() - start_time - print("%s took %.3fs" % (func.__name__, total_time)) + print "%s took %.3fs" % (func.func_name, total_time) return wrapper def hotshotit(func): def wrapper(*args, **kw): import hotshot global hotshotProfilers - prof_name = func.__name__+".prof" + prof_name = func.func_name+".prof" profiler = hotshotProfilers.get(prof_name) if profiler is None: profiler = hotshot.Profile(prof_name) diff --git a/sandbox/wiki.py b/sandbox/wiki.py index f270b636..ac27199a 100644 --- a/sandbox/wiki.py +++ b/sandbox/wiki.py @@ -18,4 +18,4 @@ ] processor = markdown2.Markdown(extras=["link-patterns"], link_patterns=link_patterns) -print(processor.convert(wiki_page)) +print processor.convert(wiki_page) diff --git a/test/markdown.py b/test/markdown.py index e18336b1..c76f8d95 100644 --- a/test/markdown.py +++ b/test/markdown.py @@ -32,13 +32,13 @@ import re, sys, codecs # Set debug level: 3 none, 2 critical, 1 informative, 0 all -(VERBOSE, INFO, CRITICAL, NONE) = list(range(4)) +(VERBOSE, INFO, CRITICAL, NONE) = range(4) MESSAGE_THRESHOLD = CRITICAL def message(level, text) : if level >= MESSAGE_THRESHOLD : - print(text) + print text # --------------- CONSTANTS YOU MIGHT WANT TO MODIFY ----------------- @@ -48,9 +48,9 @@ def message(level, text) : SMART_EMPHASIS = 1 # this_or_that does not become thisorthat HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode -RTL_BIDI_RANGES = ( ('\u0590', '\u07FF'), +RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'), # from Hebrew to Nko (includes Arabic, Syriac and Thaana) - ('\u2D30', '\u2D7F'), + (u'\u2D30', u'\u2D7F'), # Tifinagh ) @@ -62,9 +62,9 @@ def message(level, text) : # 0780-07BF - Thaana # 07C0-07FF - Nko -BOMS = { 'utf-8' : (str(codecs.BOM_UTF8, "utf-8"), ), - 'utf-16' : (str(codecs.BOM_UTF16_LE, "utf-16"), - str(codecs.BOM_UTF16_BE, "utf-16")), +BOMS = { 'utf-8' : (unicode(codecs.BOM_UTF8, "utf-8"), ), + 'utf-16' : (unicode(codecs.BOM_UTF16_LE, "utf-16"), + unicode(codecs.BOM_UTF16_BE, "utf-16")), #'utf-32' : (unicode(codecs.BOM_UTF32_LE, "utf-32"), # unicode(codecs.BOM_UTF32_BE, "utf-32")), } @@ -128,7 +128,7 @@ def getBidiType(text) : ch = text[0] - if not isinstance(ch, str) or not ch.isalpha(): + if not isinstance(ch, unicode) or not ch.isalpha(): return None else : @@ -312,7 +312,7 @@ def toxml(self): if self.nodeName in ['p', 'li', 'ul', 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] : - if "dir" not in self.attribute_values: + if not self.attribute_values.has_key("dir"): if self.bidi : bidi = self.bidi else : @@ -788,7 +788,7 @@ def handleMatch(self, m, doc): # we'll use "google" as the id id = m.group(2).lower() - if id not in self.references : # ignore undefined refs + if not self.references.has_key(id) : # ignore undefined refs return None href, title = self.references[id] text = m.group(2) @@ -1127,7 +1127,7 @@ def registerExtensions(self, extensions, configs) : % (ext, extension_module_name) ) else : - if ext in configs : + if configs.has_key(ext) : configs_for_ext = configs[ext] else : configs_for_ext = [] @@ -1489,7 +1489,7 @@ def _handleInlineWrapper (self, line) : x = parts[i] - if isinstance(x, str) : + if isinstance(x, (str, unicode)) : result = self._applyPattern(x, pattern) if result : @@ -1502,7 +1502,7 @@ def _handleInlineWrapper (self, line) : for i in range(len(parts)) : x = parts[i] - if isinstance(x, str) : + if isinstance(x, (str, unicode)) : parts[i] = self.doc.createTextNode(x) return parts @@ -1577,7 +1577,7 @@ def _applyPattern(self, line, pattern) : for item in result: - if isinstance(item, str): + if isinstance(item, (str, unicode)): if len(item) > 0: node.insertChild(position, self.doc.createTextNode(item)) @@ -1723,7 +1723,7 @@ def __init__(self, configs = {}) : self.config = configs def getConfig(self, key) : - if key in self.config : + if self.config.has_key(key) : return self.config[key][0] else : return "" @@ -1757,7 +1757,7 @@ def parse_options() : 'encoding' : None } else : - print(OPTPARSE_WARNING) + print OPTPARSE_WARNING return None parser = optparse.OptionParser(usage="%prog INPUTFILE [options]") diff --git a/test/test_markdown2.py b/test/test_markdown2.py index cc2a80b2..003f7f4b 100755 --- a/test/test_markdown2.py +++ b/test/test_markdown2.py @@ -21,6 +21,26 @@ finally: del sys.path[0] + + +#---- Python version compat + +# Use `bytes` for byte strings and `unicode` for unicode strings (str in Py3). +if sys.version_info[0] <= 2: + py3 = False + try: + bytes + except NameError: + bytes = str + base_string_type = basestring +elif sys.version_info[0] >= 3: + py3 = True + unicode = str + base_string_type = str + unichr = chr + + + #---- Test cases class _MarkdownTestCase(unittest.TestCase): @@ -103,8 +123,12 @@ def _assertMarkdown(self, text, html, text_path=None, html_path=None, def charreprreplace(exc): if not isinstance(exc, UnicodeEncodeError): raise TypeError("don't know how to handle %r" % exc) - obj_repr = repr(exc.object[exc.start:exc.end])[1:-1] - return (str(obj_repr), exc.end) + if py3: + obj_repr = repr(exc.object[exc.start:exc.end])[1:-1] + else: + # repr -> remote "u'" and "'" + obj_repr = repr(exc.object[exc.start:exc.end])[2:-1] + return (unicode(obj_repr), exc.end) codecs.register_error("charreprreplace", charreprreplace) self.assertEqual(python_norm_html, norm_html, errmsg) @@ -258,7 +282,7 @@ def test_pre(self): '

some starter text

\n\n
#!/usr/bin/python\nprint "hi"\n
\n') def test_russian(self): - ko = '\\u043b\\u0449' # 'ko' on russian keyboard + ko = '\u043b\u0449' # 'ko' on russian keyboard self._assertMarkdown("## %s" % ko, '

%s

\n' % ko) test_russian.tags = ["unicode", "issue3"] @@ -315,6 +339,13 @@ def test_api(self): test = doctest.DocFileTest("api.doctests") test.runTest() + # Don't bother on Python 3 because (a) there aren't many inline doctests, + # and (b) they are more to be didactic than comprehensive test suites. + if not py3: + def test_internal(self): + doctest.testmod(markdown2) + + #---- internal support stuff @@ -322,9 +353,9 @@ def test_api(self): def _xml_escape_sub(match): escape = match.group(1) if escape[0] == 'x': - return chr(int('0'+escape, base=16)) + return unichr(int('0'+escape, base=16)) else: - return chr(int(escape)) + return unichr(int(escape)) _markdown_email_link_re = re.compile(r'
(.*?)', re.U) def _markdown_email_link_sub(match): @@ -341,7 +372,7 @@ def norm_html_from_html(html): Also normalize EOLs. """ - if not isinstance(html, str): + if not isinstance(html, unicode): html = html.decode('utf-8') html = _markdown_email_link_re.sub( _markdown_email_link_sub, html) @@ -352,7 +383,7 @@ def norm_html_from_html(html): def _display(s): """Markup the given string for useful display.""" - if not isinstance(s, str): + if not isinstance(s, unicode): s = s.decode("utf-8") s = _indent(_escaped_text_from_text(s, "whitespace"), 4) if not s.endswith('\n'): @@ -499,7 +530,7 @@ def _escaped_text_from_text(text, escapes="eol"): # - Add _escaped_html_from_text() with a similar call sig. import re - if isinstance(escapes, str): + if isinstance(escapes, base_string_type): if escapes == "eol": escapes = {'\r\n': "\\r\\n\r\n", '\n': "\\n\n", '\r': "\\r\r"} elif escapes == "whitespace": diff --git a/test/testall.py b/test/testall.py index e26856ed..b236cbbc 100644 --- a/test/testall.py +++ b/test/testall.py @@ -3,7 +3,7 @@ # Run the test suite against all the Python versions we can find. # - +from __future__ import print_function import sys import os diff --git a/tools/tables-align-columns.py b/tools/tables-align-columns.py index f3535928..64e230e5 100755 --- a/tools/tables-align-columns.py +++ b/tools/tables-align-columns.py @@ -8,7 +8,7 @@ - Can't handle tables where cells have a pipe. """ - +from __future__ import print_function __version__ = "1.0.0" @@ -87,17 +87,17 @@ def _table_sub(match): width = width_from_col_idx[col_idx] align = align_from_col_idx[col_idx] if align == 'center': - underline.append(':' + '-'*(width-2) + ':') + underline.append(':' + u'-'*(width-2) + ':') elif align == 'right': - underline.append('-'*(width-1) + ':') + underline.append(u'-'*(width-1) + ':') elif align == 'left': - underline.append(':' + '-'*(width-1)) + underline.append(':' + u'-'*(width-1)) else: - underline.append('-'*width) + underline.append(u'-'*width) table[1:1] = [underline] #e(pformat(table, width=200)) - table_str = '\n'.join(('| ' + ' | '.join(r) + ' |') for r in table) + table_str = u'\n'.join(('| ' + u' | '.join(r) + ' |') for r in table) return table_str + '\n' text = codecs.open(path, 'rb', 'utf8').read() diff --git a/tools/which.py b/tools/which.py index eeb6b148..83e431b9 100755 --- a/tools/which.py +++ b/tools/which.py @@ -33,7 +33,7 @@ from HKLM\SOFTWARE\...\perl.exe """ - +from __future__ import print_function _cmdlnUsage = """ Show the full path of commands. @@ -93,13 +93,13 @@ def _getRegisteredExecutable(exeName): if sys.platform.startswith('win'): if os.path.splitext(exeName)[1].lower() != '.exe': exeName += '.exe' - import winreg + import _winreg try: key = "SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\App Paths\\" +\ exeName - value = winreg.QueryValue(winreg.HKEY_LOCAL_MACHINE, key) + value = _winreg.QueryValue(_winreg.HKEY_LOCAL_MACHINE, key) registered = (value, "from HKLM\\"+key) - except winreg.error: + except _winreg.error: pass if registered and not os.path.exists(registered[0]): registered = None @@ -252,7 +252,7 @@ def which(command, path=None, verbose=0, exts=None): If no match is found for the command, a WhichError is raised. """ try: - match = next(whichgen(command, path, verbose, exts)) + match = whichgen(command, path, verbose, exts).next() except StopIteration: raise WhichError("Could not find '%s' on the path." % command) return match diff --git a/tools/wiki-tables-to-tables.py b/tools/wiki-tables-to-tables.py index 8c3b3fb0..b70e20c7 100755 --- a/tools/wiki-tables-to-tables.py +++ b/tools/wiki-tables-to-tables.py @@ -29,7 +29,7 @@ mode... and the only supported mode for now. """ - +from __future__ import print_function __version__ = "1.0.0" From f9d6611496f1e23e3d463a4653142059d5768878 Mon Sep 17 00:00:00 2001 From: pknowles Date: Mon, 22 May 2023 17:31:32 -0700 Subject: [PATCH 5/5] fix special_attributes consuming all text in a paragraph --- lib/markdown2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 09838c9d..1a64d5b9 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -1496,7 +1496,7 @@ def _extract_special_attributes(self, text, start, allowlist=None): """Extracts the url and (optional) title from the tail of a link""" # text[start] equals the opening parenthesis idx = self._find_non_whitespace(text, start+1) - if idx == len(text): + if idx == len(text) or text[start] != "{": return {}, start end_idx = idx end_idx = self._find_balanced(text, end_idx, "{", "}")