diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..62104894 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,27 @@ +--- +name: Bug report +about: Report a parsing error, unexpected output and other bugs +title: '' +labels: bug +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Attach a minimal markdown snippet that causes the bug to occur. This should be placed inside a fenced code block to escape GitHub's formatting. + +If your snippet contains fenced code blocks then you can escape them by adding more backticks to the enclosing block. See the [this GitHub article](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/creating-and-highlighting-code-blocks#fenced-code-blocks) for an example. + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Debug info** +Version of library being used: + +Any extras being used: + +**Additional context** +Add any other context about the problem here. 
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml new file mode 100644 index 00000000..40ce721a --- /dev/null +++ b/.github/workflows/python.yaml @@ -0,0 +1,32 @@ +name: PythonCI +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] + os: + - ubuntu-latest + - macos-latest + - windows-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install .[all] + - name: Test + run: | + make testone + - name: Test ReDoS + run: | + make testredos diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 543f2ffb..00000000 --- a/.travis.yml +++ /dev/null @@ -1,11 +0,0 @@ -language: python -python: - - "pypy" - - "3.5" - - "3.6" - - "3.7" - - "3.8" -# command to install dependencies -install: pip install Pygments>=2.5.2 -# command to run tests -script: make testone diff --git a/CHANGES.md b/CHANGES.md index 2cb9fb3a..718fb9ed 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,10 +1,223 @@ # python-markdown2 Changelog -## python-markdown2 2.3.10 (not yet released) +## python-markdown2 2.5.6 (not yet released) + +- [pull #687] Fix AssertionError hashing HTML blocks spread over multiple lines (#686) +- [pull #692] Fix XSS from code spans in link titles (#691) +- [pull #695] Fix XSS issue from incomplete tags with no attributes (#694) +- [pull #700] Fix XSS from code spans in image alt text (#699) +- [pull #701] Allow boolean attribute syntax in `markdown-in-html` extra +- [pull #704] Fix XSS from smuggling spans into image attributes (#702, #703) + + +## python-markdown2 2.5.5 + +- [pull #639] Fix middle-word-em interfering with strongs (#637) +- [pull #640] Fix code friendly extra 
stopping other syntax being processed (#638) +- [pull #644] Fix a number of em/strong issues (#641, #642, #643) +- [pull #659] Fix a number of safemode issues (#647) +- [pull #665] Rewrite emphasis and strong processing to be more GFM compliant +- [pull #672] Fix nested footnote references (#664) +- [pull #681] Forbid square brackets in reference link IDs + + +## python-markdown2 2.5.4 + +- [pull #617] Add MarkdownFileLinks extra (#528) +- [pull #622] Add missing block tags to regex (#620) +- [pull #623] Don't escape plus signs in URLs (#621) +- [pull #626] Fix XSS when encoding incomplete tags (#625) +- [pull #628] Fix TypeError in MiddleWordEm extra when options was None (#627) +- [pull #630] Fix nbsp breaking tables (#629) +- [pull #634] Fix ReDoS in HTML tokenizer regex (#633) + + +## python-markdown2 2.5.3 + +- [pull #616] make tables without body gfm compatible + + +## python-markdown2 2.5.2 + +- [pull #605] Add support for Python 3.13, drop EOL 3.8 +- [pull #607] Fix `middle-word-em` extra preventing strongs from being recognized (#606) +- [pull #609] Add option to output to file in CLI (#608) +- [pull #612] Fix footnote labels appearing out-of-order (#536) +- [pull #613] Fix smarty pants extra not triggering when it should (#611) + + +## python-markdown2 2.5.1 + +- [pull #590] Fix underscores within bold text getting emphasized (#589) +- [pull #591] Add Alerts extra +- [pull #595] Fix img alt text being processed as markdown (#594) +- [pull #598] Add `link-shortrefs` extra (#597) +- [pull #600] Use urandom for SECRET_SALT +- [pull #602] Fix XSS issue in safe mode (#601) +- [pull #604] Fix XSS injection in image URLs (#603) + + +## python-markdown2 2.5.0 + +- [pull #519] Add support for custom extras +- [pull #519] Drop Python 3.5 support +- [pull #568] Add `prepend` arg to toc extra (#397) +- [pull #569] Process HTML comments as markdown in 'escape' safe mode +- [pull #570] Fix syntax warnings in test suite +- [pull #572] Process inline tags as HTML blocks 
when they span multiple lines (#571) +- [pull #573] Add new LaTeX Extra +- [pull #576] Fix `html`, `head` and `body` tags being wrapped in `
` tags (#575)
+- [pull #578] Graceful handling of broken lists when cuddled-lists extra is enabled
+- [pull #581] Add type hints (#562)
+- [pull #581] Drop Python 3.6 and 3.7 support
+- [pull #582] Fix fenced code blocks breaking lists (#580)
+- [pull #586] Fix #583 by tweaking incomplete tag regex
+- [pull #587] Fix AssertionError on malformed HTML (#584)
+
+
+## python-markdown2 2.4.13
+
+- [pull #559] Allow cuddled tables (#557)
+- [pull #560] Fix `markdown-in-html` not always splitting HTML tags into separate lines (#558)
+- [pull #564] Fix incomplete comments in safe mode not being escaped (#563)
+- [pull #566] Fix crash in `markdown-in-html` extra (#565)
+
+
+## python-markdown2 2.4.12
+
+- [pull #547] Update `markdown-in-html` extra to handle markdown on same line as HTML (#546)
+- [pull #550] Fix tables with trailing whitespace not being recognized (#549)
+- [pull #545] Fix multiple instances of strong emphasis (`**`) in one line (#541)
+- [pull #556] Fix incorrect parsing of links after square brackets (#552)
+
+## python-markdown2 2.4.11
+
+- [pull #524] Fix angles being escaped in style blocks (issue #523)
+- [pull #527] Fix base64 images being corrupted in safe mode (issue #526)
+- [pull #529] Add `breaks` extra with ability to hard break on backslashes (issue #525)
+- [pull #532] Fix #493 persisting when `code-friendly` extra enabled
+- [pull #535] Update `_slugify` to use utf-8 encoding (issue #534)
+- [pull #536] Maintain order of appearance in footnotes
+- [pull #538] Include HTML headers in TOC
+- [pull #540] Add mechanism to prevent header ID counter resetting (issue #530)
+
+## python-markdown2 2.4.10
+
+- [pull #520] Allow more relative links in safe mode (issue #517)
+- [pull #521] Always restore hashed HTML blocks (issue #185)
+- [pull #522] Add `middle-word-em` extra
+
+
+## python-markdown2 2.4.9
+
+- [pull #500] Add `` tag to html-classes extra
+- [pull #501] Fix link patterns extra matching against internal hashes
+- [pull #502] Replace deprecated `optparse` with `argparse`
+- [pull #506] Fix `_uniform_outdent` failing with empty strings (issue #505)
+- [pull #509] Fix HTML elements not unhashing correctly (issue #508)
+- [pull #511] Remove deprecated `imp` module (issue #510)
+- [pull #512] Allow link patterns to be passed via extras dict
+- [pull #513] Fix relative links not working in safe mode (issue #254)
+
+
+## python-markdown2 2.4.8
+
+- [pull #499] Fix images not being processed correctly (#498)
+
+
+## python-markdown2 2.4.7
+
+- [pull #483] Fix hashing nested HTML blocks
+- [pull #486] Fix backslash being unable to escape raw HTML tags
+- [pull #482] Add support for telegram spoiler in extras
+- [pull #485] mermaid support
+- [pull #487] Fix escaping ampersands in hrefs
+- [pull #490] Fix indented codeblocks inside fences (#489)
+- [pull #490] Remove `code-color` extra
+
+
+## python-markdown2 2.4.6
+
+- [pull #477] Feature wavedrom support
+- [pull #480] Fix mixing ordered and un-ordered lists combining into single list type
+
+
+## python-markdown2 2.4.5
+
+- [pull #466] Add optional dependencies to `setup.py`
+
+
+## python-markdown2 2.4.4
+
+- [pull #439] Fix TypeError if html-classes extra is None
+- [pull #441] Remove Python2 support
+- [pull #445] Replace ` boo! boo! boo! boo! boom! boom! boo! boo! tags.
graf = self._run_span_gamut(graf)
@@ -2114,7 +2144,7 @@ def _form_paragraphs(self, text):
return "\n\n".join(grafs)
- def _add_footnotes(self, text):
+ def _add_footnotes(self, text: str) -> str:
if self.footnotes:
footer = [
'
+ \[!(?P {contents}\n %s\n Link: http://example.com/. With an ampersand: http://example.com/?foo=1&bar=2 With an ampersand: http://example.com/?foo=1&bar=2 Right paren: ) Greater-than: > Greater-than: > Hash: # This is strong and em. This is strong and em. So is this word. So is this word. This is strong and em. This is strong and em. So is this word. So is this word. some starter text max_time + tolerance:
- raise DurationError(('Test was too long (%.2f s)'
- % total_time))
+ raise DurationError('Test was too long (%.2f s)'
+ % total_time)
return wrapper
return _timedtest
@@ -144,7 +140,7 @@ def wrapper(*args, **kw):
#---- module api
-class Test(object):
+class Test:
def __init__(self, ns, testmod, testcase, testfn_name,
testsuite_class=None):
self.ns = ns
@@ -238,13 +234,12 @@ def testmods_from_testdir(testdir):
testmod_name = splitext(basename(testmod_path))[0]
log.debug("import test module '%s'", testmod_path)
try:
- iinfo = imp.find_module(testmod_name, [dirname(testmod_path)])
testabsdir = abspath(testdir)
sys.path.insert(0, testabsdir)
old_dir = os.getcwd()
os.chdir(testdir)
try:
- testmod = imp.load_module(testmod_name, *iinfo)
+ testmod = importlib.import_module(testmod_name)
finally:
os.chdir(old_dir)
sys.path.remove(testabsdir)
@@ -444,7 +439,7 @@ def list_tests(testdir_from_ns, tags):
if testfile.endswith(".pyc"):
testfile = testfile[:-1]
print("%s:" % t.shortname())
- print(" from: %s#%s.%s" % (testfile,
+ print(" from: {}#{}.{}".format(testfile,
t.testcase.__class__.__name__, t.testfn_name))
wrapped = textwrap.fill(' '.join(t.tags()), WIDTH-10)
print(" tags: %s" % _indent(wrapped, 8, True))
@@ -475,7 +470,7 @@ def __init__(self, stream):
def getDescription(self, test):
if test._testlib_explicit_tags_:
- return "%s [%s]" % (test._testlib_shortname_,
+ return "{} [{}]".format(test._testlib_shortname_,
', '.join(test._testlib_explicit_tags_))
else:
return test._testlib_shortname_
@@ -519,7 +514,7 @@ def printErrorList(self, flavour, errors):
self.stream.write("%s\n" % err)
-class ConsoleTestRunner(object):
+class ConsoleTestRunner:
"""A test runner class that displays results on the console.
It prints out the names of tests as they are run, errors as they
diff --git a/test/tm-cases/admonitions.html b/test/tm-cases/admonitions.html
new file mode 100644
index 00000000..e8a7d05e
--- /dev/null
+++ b/test/tm-cases/admonitions.html
@@ -0,0 +1,53 @@
+
+
+
+
+ Otherwise the text is no longer part of the admonition. Admonitions are able to contain fenced code blocks Useful information that users should know, even when skimming content. Helpful advice for doing things better or more easily. Key information users need to know to achieve their goal. Urgent info that needs immediate user attention to avoid problems. Advises about risks or negative outcomes of certain actions. one this is <strong>some strong</strong> text <strong>text</strong> text \with double\ escapes how about text \<strong>with triple\</strong> escapes escaped auto-link <https://www.example.com>
+not quite escaped auto link \https://www.example.com
+escaped auto-link \<https://www.example.com> <!-- and escaped HTML comment --> \ \<!--and another that is--> hello \world hello \world my favourite letter is hello \world my favourite code is hello \world my favourite letter is [HTML_REMOVED]alert(1)[HTML_REMOVED] link4 >[HTML_REMOVED]alert(1)[HTML_REMOVED] link4 >[HTML_REMOVED]alert(1)[HTML_REMOVED] <img src="javascript:alert(1)" <div>yowzer!</div> blah foo <!-- bar` with `` in strike extra
+- [pull #446] Fix link patterns extra applying within links
+- [pull #443] create proper entry point
+- [pull #449] Codespans inside link text issue344
+- [pull #451] Underline and HTML comments
+- [pull #453] Links with brackets
+- [pull #454] Fix emacs local variable one-liners
+- [pull #457] Example of the current mixed-paragraph mode behavior in lists
+- [pull #455] Fix code block indentation in lists
+- [pull #434] Fix filter bypass leading to XSS (#362)
+- [pull #464] Fix html-classes extra not applying to code spans
+- [pull #462] Fix pygments block matching
+- [pull #462] Fix pyshell blocks in blockquotes
+- [pull #463] Fix multilevel lists
+- [pull #468] Remove `_uniform_outdent_limit` function
+- [pull #470] Add support for ordered lists that don't start at 1. (#469)
+- [pull #472] Fix `AssertionError` with lazy numbered lists (issue #471)
+- [pull #475] Add `` and `
` tags to html-classes extra (#352)
+- [pull #473] XSS test and fix
+
+
+## python-markdown2 2.4.3
+
+- [pull #413] Fix meta indentation
+- [pull #414] Fix code surrounded by blank lines inside blockquote fenced code blocks
+- [pull #417] Fix inline code pipe symbol within tables (issue #399)
+- [pull #418] Fix code block parsing error (issue #327)
+- [pull #419] Fix hr block created when not supposed to (issue #400)
+- [pull #421] Fix backslashes removed by adjacent code blocks (issues #369 and #412)
+- [pull #420] Fix md5-* in resulting HTML when several code blocks follow one by one (issue #355)
+- [pull #422] Fix excessive `
` tags in lists using break-on-newline extra (issue #394)
+- [pull #424] Standardize key and value definitions for metadata extra (issue #423)
+- [pull #427] Fix fenced code blocks breaking lists (issue #426)
+- [pull #429] Fix catastrophic backtracking (Regex DoS) in pyshell blocks.
+- [pull #431] Fix incorrect indentation of fenced code blocks within lists
+- [pull #436] RST admonitions
+- [pull #430] Improve error message if link_patterns forgotten
+- [pull #437] fix compatibility with pygments 2.12
+
+
+## python-markdown2 2.4.2
+
+- [pull #408] Fix for fenced code blocks issue #396
+- [pull #410] Be more strict on auto linking urls, RE DOS fix
+
+
+## python-markdown2 2.4.1
+
+- [pull #389] Tables extra: allow whitespace at the end of the underline row
+- [pull #392] Pyshell extra: enable syntax highlighting if `fenced-code-blocks` is loaded.
+- [pull #402] Regex DOS bandaid fix
+
+
+## python-markdown2 2.4.0
+
+- [pull #377] Fixed bug breaking strings elements in metadata lists
+- [pull #380] When rendering fenced code blocks, also add the `language-LANG` class
+- [pull #387] Regex DoS fixes
+
+
+## python-markdown2 2.3.10
- [pull #356] Don't merge sequential quotes into a single blockquote
- [pull #357] use style=text-align for table alignment
- [pull #360] introduce underline extra
+- [pull #368] Support for structured and nested values in metadata
+- [pull #371] add noopener to external links
## python-markdown2 2.3.9
diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt
index b0a58b5a..0820b079 100644
--- a/CONTRIBUTORS.txt
+++ b/CONTRIBUTORS.txt
@@ -46,3 +46,19 @@ ryanvilbrandt (github.com/ryanvilbrandt)
Gareth Simpson (github.com/xurble)
Kat Hagan (github.com/codebykat)
Stɑrry Shivɑm (github.com/starry69)
+André Nasturas (github.com/andrenasturas)
+Denis Kasak (github.com/dkasak)
+Maximilian Hils (github.com/mhils)
+BarkeH (github.com/BarkeH)
+cav71 (github.com/cav71)
+Crozzers (github.com/Crozzers)
+Bastian Venthur (https://github.com/venthur), removed Python2 support
+gitbra (github.com/gitbra)
+Łukasz Langa (github.com/ambv)
+Max Omdal (github.com/momja)
+Kishore (github.com/jk6521)
+Ircama (github.com/Ircama)
+Ankit Mahato (github.com/animator)
+Eric Dufresne (github.com/edufresne)
+Lyra Rebane (github.com/rebane2001)
+Raul Bocanegra Algarra (github.com/raulbocanegra)
diff --git a/MANIFEST.in b/MANIFEST.in
index 3d26d254..ef4d68ce 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -6,6 +6,7 @@ include CHANGES.md
include Makefile
include setup.py
include setup.cfg
+include tox.ini
include bin/markdown2
include test/api.doctests
diff --git a/Makefile b/Makefile
index b6f88c9b..90a4fa0a 100644
--- a/Makefile
+++ b/Makefile
@@ -12,6 +12,14 @@ test:
testone:
cd test && python test.py -- -knownfailure
+.PHONY: testwarn
+testwarn:
+ cd test && python -Wd test.py -- -knownfailure
+
+.PHONY: testredos
+testredos:
+ python test/test_redos.py
+
.PHONY: pygments
pygments:
[[ -d deps/pygments ]] || ( \
diff --git a/README.md b/README.md
index 7c651636..f1e89c9f 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ was written to closely match the behaviour of the original Perl-implemented
Markdown.pl. Markdown2 also comes with a number of extensions (called
"extras") for things like syntax coloring, tables, header-ids. See the
"Extra Syntax" section below. "markdown2" supports all Python versions
-2.6+ or 3.3+ (and pypy and jython, though I don't frequently test those).
+3.5+ (and pypy and jython, though I don't frequently test those).
There is another [Python
markdown.py](https://python-markdown.github.io/). However, at
@@ -28,14 +28,12 @@ your consideration.
Follow @trentmick
for updates to python-markdown2.
-Travis-ci.org test status: [](http://travis-ci.org/trentm/python-markdown2)
-
-
# Install
To install it in your Python installation run *one* of the following:
pip install markdown2
+ pip install markdown2[all] # to install all optional dependencies (eg: Pygments for code syntax highlighting)
pypm install markdown2 # if you use ActivePython (activestate.com/activepython)
easy_install markdown2 # if this is the best you have
python setup.py install
@@ -51,14 +49,14 @@ As a module:
```python
>>> import markdown2
>>> markdown2.markdown("*boo!*") # or use `html = markdown_path(PATH)`
-u'`-blocks, auto-linking patterns, table of contents,
diff --git a/TODO.txt b/TODO.txt
index e6f880b2..9f8cbb9d 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,4 +1,3 @@
-- py3: py2.4 test (broken?)
- add "smarty-pants" extra to wiki
- add "html-classes" extra to wiki
- more on the "code-color" extra wiki page
diff --git a/bin/markdown2 b/bin/markdown2
deleted file mode 100755
index a644b96b..00000000
--- a/bin/markdown2
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-from os.path import join, dirname, exists
-
-# Use the local markdown2.py if we are in the source tree.
-source_tree_markdown2 = join(dirname(__file__), "..", "lib", "markdown2.py")
-if exists(source_tree_markdown2):
- sys.path.insert(0, dirname(source_tree_markdown2))
- try:
- from markdown2 import main
- finally:
- del sys.path[0]
-else:
- from markdown2 import main
-
-if __name__ == "__main__":
- sys.exit( main(sys.argv) )
diff --git a/lib/markdown2.py b/lib/markdown2.py
index e69a5612..dc698970 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -40,6 +40,12 @@
Supported extra syntax options (see -x|--extras option below and
see
))?(?({})\b>?)'.format('' if allow_indent else '0', current_tag), chunk
+ )
+ block += chunk
+
+ if is_markup:
+ if chunk.startswith('%s' % is_markup.group(1)):
+ tag_count -= 1
+ else:
+ # if close tag is in same line
+ if self._tag_is_closed(is_markup.group(3), chunk):
+ # we must ignore these
+ is_markup = None
+ else:
+ tag_count += 1
+ current_tag = is_markup.group(3)
+
+ if tag_count == 0:
+ if is_markup:
+ block = callback(block.rstrip('\n')) # remove trailing newline
+ current_tag = html_tags_re
+ result += block
+ block = ''
+
+ result += block
+
+ return result
+
+ def _tag_is_closed(self, tag_name: str, text: str) -> bool:
+ # check if number of open tags == number of close tags
+ if len(re.findall('<%s(?:.*?)>' % tag_name, text)) != text.count('%s>' % tag_name):
+ return False
+
+ # check that close tag position is AFTER open tag
+ close_index = text.find(f'{tag_name}')
+ open_index = text.find(f'<{tag_name}')
+ return open_index != -1 and close_index != -1 and open_index < close_index
+
+ @mark_stage(Stage.LINK_DEFS)
+ def _strip_link_definitions(self, text: str) -> str:
# Strips link definitions from text, stores the URLs and titles in
# hash references.
less_than_tab = self.tab_width - 1
@@ -821,7 +1164,7 @@ def _strip_link_definitions(self, text):
""" % less_than_tab, re.X | re.M | re.U)
return _link_def_re.sub(self._extract_link_def_sub, text)
- def _extract_link_def_sub(self, match):
+ def _extract_link_def_sub(self, match: re.Match[str]) -> str:
id, url, title = match.groups()
key = id.lower() # Link IDs are case-insensitive
self.urls[key] = self._encode_amps_and_angles(url)
@@ -829,65 +1172,7 @@ def _extract_link_def_sub(self, match):
self.titles[key] = title
return ""
- def _do_numbering(self, text):
- ''' We handle the special extension for generic numbering for
- tables, figures etc.
- '''
- # First pass to define all the references
- self.regex_defns = re.compile(r'''
- \[\#(\w+)\s* # the counter. Open square plus hash plus a word \1
- ([^@]*)\s* # Some optional characters, that aren't an @. \2
- @(\w+) # the id. Should this be normed? \3
- ([^\]]*)\] # The rest of the text up to the terminating ] \4
- ''', re.VERBOSE)
- self.regex_subs = re.compile(r"\[@(\w+)\s*\]") # [@ref_id]
- counters = {}
- references = {}
- replacements = []
- definition_html = '
when True
+ - on_backslash: Replace backslashes at the end of a line with
+* break-on-newline: Alias for the on_newline option in the breaks extra.
* code-friendly: Disable _ and __ for em and strong.
* cuddled-lists: Allow lists to be cuddled to the preceding paragraph.
* fenced-code-blocks: Allows a code block to not have to be indented
@@ -54,16 +60,23 @@
highlighting when using fenced-code-blocks and highlightjs.
* html-classes: Takes a dict mapping html tag names (lowercase) to a
string to use for a "class" tag attribute. Currently only supports "img",
- "table", "pre" and "code" tags. Add an issue if you require this for other
- tags.
+ "table", "thead", "pre", "code", "ul" and "ol" tags. Add an issue if you require
+ this for other tags.
* link-patterns: Auto-link given regex patterns in text (e.g. bug number
references, revision number references).
-* markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
- have markdown processing be done on its contents. Similar to
+* link-shortrefs: allow shortcut reference links, not followed by `[]` or
+ a link label.
+* markdown-file-links: Replace links to `.md` files with `.html` links
+* markdown-in-html: Allow the use of `markdown="1"` or simply a `markdown` boolean
+ attribute in a block HTML tag to have markdown processing be done on its
+ contents. Similar to
. It was easier to make a special
# case than to make the other regex more complicated.
if "
str:
+ '''
+ Finds and substitutes HTML blocks within blocks of text
+
+ Args:
+ text: the text to search
+ html_tags_re: a regex pattern of HTML block tags to match against.
+ For example, `Markdown._block_tags_a`
+ callback: callback function that receives the found HTML text block and returns a new str
+ allow_indent: allow matching HTML blocks that are not completely outdented
+ '''
+ tag_count = 0
+ current_tag = html_tags_re
+ block = ''
+ result = ''
+
+ for chunk in text.splitlines(True):
+ is_markup = re.match(
+ r'^(\s{{0,{}}})(?:(?=']
- cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)))]
- for col_idx, col in enumerate(cols):
- hlines.append(' ')
- hlines.append('')
-
- # tbody
- hlines.append('')
- for line in body.strip('\n').split('\n'):
- hlines.append('%s ' % (
- align_from_col_idx.get(col_idx, ''),
- self._run_span_gamut(col)
- ))
- hlines.append('')
- cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)))]
- for col_idx, col in enumerate(cols):
- hlines.append(' ')
- hlines.append('')
- hlines.append('')
-
- return '\n'.join(hlines) + '\n'
-
- def _do_tables(self, text):
- """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
- https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
- """
- less_than_tab = self.tab_width - 1
- table_re = re.compile(r'''
- (?:(?<=\n\n)|\A\n?) # leading blank line
-
- ^[ ]{0,%d} # allowed whitespace
- (.*[|].*) \n # $1: header row (at least one pipe)
-
- ^[ ]{0,%d} # allowed whitespace
- ( # $2: underline row
- # underline row with leading bar
- (?: \|\ *:?-+:?\ * )+ \|? \n
- |
- # or, underline row without leading bar
- (?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \n
- )
-
- ( # $3: data rows
- (?:
- ^[ ]{0,%d}(?!\ ) # ensure line begins with 0 to less_than_tab spaces
- .*\|.* \n
- )+
- )
- ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
- return table_re.sub(self._table_sub, text)
-
- def _wiki_table_sub(self, match):
- ttext = match.group(0).strip()
- # print('wiki table: %r' % match.group(0))
- rows = []
- for line in ttext.splitlines(0):
- line = line.strip()[2:-2].strip()
- row = [c.strip() for c in re.split(r'(?' % self._html_class_str_from_tag('table'))
- # Check if first cell of first row is a header cell. If so, assume the whole row is a header row.
- if rows and rows[0] and re.match(r"^\s*~", rows[0][0]):
- add_hline('', 1)
- add_hline('%s ' % (
- align_from_col_idx.get(col_idx, ''),
- self._run_span_gamut(col)
- ))
- hlines.append('', 2)
- for cell in rows[0]:
- add_hline(" ', 2)
- add_hline('', 1)
- # Only one header row allowed.
- rows = rows[1:]
- # If no more rows, don't create a tbody.
- if rows:
- add_hline('', 1)
- for row in rows:
- add_hline('{} ".format(format_cell(cell)), 3)
- add_hline('', 2)
- for cell in row:
- add_hline(' ', 2)
- add_hline('', 1)
- add_hline('')
- return '\n'.join(hlines) + '\n'
-
- def _do_wiki_tables(self, text):
- # Optimization.
- if "||" not in text:
- return text
-
- less_than_tab = self.tab_width - 1
- wiki_table_re = re.compile(r'''
- (?:(?<=\n\n)|\A\n?) # leading blank line
- ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n # first line
- (^\1\|\|.+?\|\|\n)* # any number of subsequent lines
- ''' % less_than_tab, re.M | re.X)
- return wiki_table_re.sub(self._wiki_table_sub, text)
-
- def _run_span_gamut(self, text):
+ @mark_stage(Stage.SPAN_GAMUT)
+ def _run_span_gamut(self, text: str) -> str:
# These are all the transformations that occur *within* block-level
# tags like paragraphs, headers, and list items.
@@ -1134,9 +1257,6 @@ def _run_span_gamut(self, text):
text = self._escape_special_chars(text)
# Process anchor and image tags.
- if "link-patterns" in self.extras:
- text = self._do_link_patterns(text)
-
text = self._do_links(text)
# Make links out of things like `{} '.format(format_cell(cell)), 3)
- add_hline('
)", "
- |
- # auto-link (e.g., ':
+ peek_tokens = split_tokens[index: index + 3]
+ elif token == '':
+ peek_tokens = split_tokens[index - 2: index + 1]
+ else:
+ return False
+ except IndexError:
+ return False
+
+ return re.match(r'md5-[A-Fa-f0-9]{32}', ''.join(peek_tokens))
+
+ def _is_comment(token):
+ if self.safe_mode == 'replace':
+ # don't bother processing each section of comment in replace mode. Just do the whole thing
+ return
+ return re.match(r'()', token)
+
+ # protect raw code spans from processing, as they can often contain anything that looks like HTML and
+ # trips up the regex. These are encoded and processed later on anyway
+ code_hashes = {}
+ text = self._code_span_re.sub(
+ lambda m: self._hash_span(m.string[m.start(): m.end()], code_hashes),
+ text
+ )
tokens = []
- is_html_markup = False
- for token in self._sorta_html_tokenize_re.split(text):
- if is_html_markup and not _is_auto_link(token):
- sanitized = self._sanitize_html(token)
- key = _hash_text(sanitized)
- self.html_spans[key] = sanitized
- tokens.append(key)
+ split_tokens = self._sorta_html_tokenize_re.split(text)
+ index = 0
+ while index < len(split_tokens):
+ is_html_markup = index % 2 != 0
+ token = split_tokens[index]
+ is_code = _is_code_span(index, token)
+
+ if is_html_markup and not self._is_auto_link(token) and not is_code:
+ is_comment = _is_comment(token)
+ if is_comment:
+ tokens.append(self._hash_span(self._sanitize_html(is_comment.group(1))))
+ # sanitise but leave comment body intact for further markdown processing
+ tokens.append(self._sanitize_html(is_comment.group(2)))
+ tokens.append(self._hash_span(self._sanitize_html(is_comment.group(3))))
+ elif self._is_unescaped_re.match(token) is None:
+ # if the HTML is escaped then escape any special chars and add the token as-is
+ tokens.append(self._escape_special_chars(token))
+ else:
+ tokens.append(self._hash_span(self._sanitize_html(token)))
+ elif is_html_markup and is_code:
+ # code span contents are hashed, so should be safe to just add directly
+ tokens.extend(split_tokens[index: index + 3])
+ index += 3
+ continue
else:
tokens.append(self._encode_incomplete_tags(token))
- is_html_markup = not is_html_markup
- return ''.join(tokens)
+ index += 1
+
+ text = ''.join(tokens)
+ # put markdown code spans back into the text for processing
+ for key, code in code_hashes.items():
+ text = text.replace(key, code)
+ return text
+
+ def _unhash_html_spans(self, text: str, spans=True, code=False) -> str:
+ '''
+ Recursively unhash a block of text
- def _unhash_html_spans(self, text):
- for key, sanitized in list(self.html_spans.items()):
- text = text.replace(key, sanitized)
+ Args:
+ spans: unhash anything from `self.html_spans`
+ code: unhash code blocks
+ '''
+ orig = ''
+ while text != orig:
+ if spans:
+ for key, sanitized in list(self.html_spans.items()):
+ text = text.replace(key, sanitized)
+ if code:
+ for code, key in list(self._code_table.items()):
+ text = text.replace(key, code)
+ orig = text
return text
- def _sanitize_html(self, s):
+ def _sanitize_html(self, s: str) -> str:
if self.safe_mode == "replace":
return self.html_removed_text
elif self.safe_mode == "escape":
@@ -1265,7 +1460,7 @@ def _sanitize_html(self, s):
[ ]? # one optional space
(?:\n[ ]*)? # one optional newline followed by spaces
\[
- (?P tags.
This is a combination of Markdown.pl's _DoAnchors() and
@@ -1325,197 +1553,16 @@ def _do_links(self, text):
Markdown.pl because of the lack of atomic matching support in
Python's regex engine used in $g_nested_brackets.
"""
- MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24
-
- # `anchor_allowed_pos` is used to support img links inside
- # anchors, but not anchors inside anchors. An anchor's start
- # pos must be `>= anchor_allowed_pos`.
- anchor_allowed_pos = 0
-
- curr_pos = 0
- while True: # Handle the next link.
- # The next '[' is the start of:
- # - an inline anchor: [text](url "title")
- # - a reference anchor: [text][id]
- # - an inline img: 
- # - a reference img: ![text][id]
- # - a footnote ref: [^id]
- # (Only if 'footnotes' extra enabled)
- # - a footnote defn: [^id]: ...
- # (Only if 'footnotes' extra enabled) These have already
- # been stripped in _strip_footnote_definitions() so no
- # need to watch for them.
- # - a link definition: [id]: url "title"
- # These have already been stripped in
- # _strip_link_definitions() so no need to watch for them.
- # - not markup: [...anything else...
- try:
- start_idx = text.index('[', curr_pos)
- except ValueError:
- break
- text_length = len(text)
-
- # Find the matching closing ']'.
- # Markdown.pl allows *matching* brackets in link text so we
- # will here too. Markdown.pl *doesn't* currently allow
- # matching brackets in img alt text -- we'll differ in that
- # regard.
- bracket_depth = 0
- for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
- text_length)):
- ch = text[p]
- if ch == ']':
- bracket_depth -= 1
- if bracket_depth < 0:
- break
- elif ch == '[':
- bracket_depth += 1
- else:
- # Closing bracket not found within sentinel length.
- # This isn't markup.
- curr_pos = start_idx + 1
- continue
- link_text = text[start_idx+1:p]
-
- # Fix for issue 341 - Injecting XSS into link text
- if self.safe_mode:
- link_text = self._hash_html_spans(link_text)
- link_text = self._unhash_html_spans(link_text)
-
- # Possibly a footnote ref?
- if "footnotes" in self.extras and link_text.startswith("^"):
- normed_id = re.sub(r'\W', '-', link_text[1:])
- if normed_id in self.footnotes:
- self.footnote_ids.append(normed_id)
- result = '' \
- '%s' \
- % (normed_id, normed_id, len(self.footnote_ids))
- text = text[:start_idx] + result + text[p+1:]
- else:
- # This id isn't defined, leave the markup alone.
- curr_pos = p+1
- continue
-
- # Now determine what this is by the remainder.
- p += 1
- if p == text_length:
- return text
-
- # Inline anchor or img?
- if text[p] == '(': # attempt at perf improvement
- url, title, url_end_idx = self._extract_url_and_title(text, p)
- if url is not None:
- # Handle an inline anchor or img.
- is_img = start_idx > 0 and text[start_idx-1] == "!"
- if is_img:
- start_idx -= 1
-
- # We've got to encode these to avoid conflicting
- # with italics/bold.
- url = url.replace('*', self._escape_table['*']) \
- .replace('_', self._escape_table['_'])
- if title:
- title_str = ' title="%s"' % (
- _xml_escape_attr(title)
- .replace('*', self._escape_table['*'])
- .replace('_', self._escape_table['_']))
- else:
- title_str = ''
- if is_img:
- img_class_str = self._html_class_str_from_tag("img")
- result = '
= anchor_allowed_pos:
- safe_link = self._safe_protocols.match(url) or url.startswith('#')
- if self.safe_mode and not safe_link:
- result_head = '' % (title_str)
- else:
- result_head = '' % (_html_escape_url(url, safe_mode=self.safe_mode), title_str)
- result = '%s%s' % (result_head, link_text)
- if "smarty-pants" in self.extras:
- result = result.replace('"', self._escape_table['"'])
- #
allowed from curr_pos on, from
- # anchor_allowed_pos on.
- curr_pos = start_idx + len(result_head)
- anchor_allowed_pos = start_idx + len(result)
- text = text[:start_idx] + result + text[url_end_idx:]
- else:
- # Anchor not allowed here.
- curr_pos = start_idx + 1
- continue
-
- # Reference anchor or img?
- else:
- match = self._tail_of_reference_link_re.match(text, p)
- if match:
- # Handle a reference-style anchor or img.
- is_img = start_idx > 0 and text[start_idx-1] == "!"
- if is_img:
- start_idx -= 1
- link_id = match.group("id").lower()
- if not link_id:
- link_id = link_text.lower() # for links like [this][]
- if link_id in self.urls:
- url = self.urls[link_id]
- # We've got to encode these to avoid conflicting
- # with italics/bold.
- url = url.replace('*', self._escape_table['*']) \
- .replace('_', self._escape_table['_'])
- title = self.titles.get(link_id)
- if title:
- title = _xml_escape_attr(title) \
- .replace('*', self._escape_table['*']) \
- .replace('_', self._escape_table['_'])
- title_str = ' title="%s"' % title
- else:
- title_str = ''
- if is_img:
- img_class_str = self._html_class_str_from_tag("img")
- result = '
= anchor_allowed_pos:
- if self.safe_mode and not self._safe_protocols.match(url):
- result_head = '' % (title_str)
- else:
- result_head = '' % (_html_escape_url(url, safe_mode=self.safe_mode), title_str)
- result = '%s%s' % (result_head, link_text)
- if "smarty-pants" in self.extras:
- result = result.replace('"', self._escape_table['"'])
- #
allowed from curr_pos on, from
- # anchor_allowed_pos on.
- curr_pos = start_idx + len(result_head)
- anchor_allowed_pos = start_idx + len(result)
- text = text[:start_idx] + result + text[match.end():]
- else:
- # Anchor not allowed here.
- curr_pos = start_idx + 1
- else:
- # This id isn't defined, leave the markup alone.
- curr_pos = match.end()
- continue
-
- # Otherwise, it isn't markup.
- curr_pos = start_idx + 1
-
+ link_processor = LinkProcessor(self, None)
+ if link_processor.test(text):
+ text = link_processor.run(text)
return text
- def header_id_from_text(self, text, prefix, n):
+ def header_id_from_text(self,
+ text: str,
+ prefix: str,
+ n: Optional[int] = None
+ ) -> str:
"""Generate a header id attribute value from the given header
HTML content.
@@ -1525,13 +1572,13 @@ def header_id_from_text(self, text, prefix, n):
@param text {str} The text of the header tag
@param prefix {str} The requested prefix for header ids. This is the
value of the "header-ids" extra key, if any. Otherwise, None.
- @param n {int} The
tag.
+ @param n {int} (unused) The
tag.
@returns {str} The value for the header tag's "id" attribute. Return
None to not have an id attribute and to exclude this header from
the TOC (if the "toc" extra is specified).
"""
header_id = _slugify(text)
- if prefix and isinstance(prefix, base_string_type):
+ if prefix and isinstance(prefix, str):
header_id = prefix + '-' + header_id
self._count_from_header_id[header_id] += 1
@@ -1540,7 +1587,14 @@ def header_id_from_text(self, text, prefix, n):
return header_id
- def _toc_add_entry(self, level, id, name):
+ def _header_id_exists(self, text: str) -> bool:
+ header_id = _slugify(text)
+ prefix = self.extras['header-ids'].get('prefix')
+ if prefix and isinstance(prefix, str):
+ header_id = prefix + '-' + header_id
+ return header_id in self._count_from_header_id or header_id in map(lambda x: x[1], self._toc)
+
+ def _toc_add_entry(self, level: int, id: str, name: str) -> None:
if level > self._toc_depth:
return
if self._toc is None:
@@ -1548,12 +1602,12 @@ def _toc_add_entry(self, level, id, name):
self._toc.append((level, id, self._unescape_special_chars(name)))
_h_re_base = r'''
- (^(.+)[ \t]*\n(=+|-+)[ \t]*\n+)
+ (^(.+)[ \t]{0,99}\n(=+|-+)[ \t]*\n+)
|
(^(\#{1,6}) # \1 = string of #'s
[ \t]%s
(.+?) # \2 = Header text
- [ \t]*
+ [ \t]{0,99}
(? str:
+ '''Handles processing markdown headers'''
if match.group(1) is not None and match.group(3) == "-":
return match.group(1)
elif match.group(1) is not None:
@@ -1581,7 +1636,7 @@ def _h_sub(self, match):
header_id_attr = ""
if "header-ids" in self.extras:
header_id = self.header_id_from_text(header_group,
- self.extras["header-ids"], n)
+ self.extras["header-ids"].get('prefix'), n)
if header_id:
header_id_attr = ' id="%s"' % header_id
html = self._run_span_gamut(header_group)
@@ -1589,7 +1644,39 @@ def _h_sub(self, match):
self._toc_add_entry(n, header_id, html)
return "
tags.
"""
yield 0, ")
''' % (self.tab_width, self.tab_width),
re.M | re.X)
return code_block_re.sub(self._code_block_sub, text)
- _fenced_code_block_re = re.compile(r'''
- (?:\n+|\A\n?)
- ^```\s*?([\w+-]+)?\s*?\n # opening fence, $1 = optional lang
- (.*?) # $2 = code block content
- ^```[ \t]*\n # closing fence
- ''', re.M | re.X | re.S)
-
- def _fenced_code_block_sub(self, match):
- return self._code_block_sub(match, is_fenced_code_block=True)
-
- def _do_fenced_code_blocks(self, text):
- """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
- return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text)
-
# Rules for a code span:
# - backslash escapes are not interpreted in a code span
# - to include one or or a run of more backticks the delimiters must
@@ -1897,14 +1984,15 @@ def _do_fenced_code_blocks(self, text):
(? str:
c = match.group(2).strip(" \t")
c = self._encode_code(c)
- return ""
- for tup in inner:
- yield tup
+ yield from inner
yield 0, ""
- def wrap(self, source, outfile):
+ def _add_newline(self, inner):
+ # Add newlines around the inner contents so that _strict_tag_block_re matches the outer div.
+ yield 0, "\n"
+ yield from inner
+ yield 0, "\n"
+
+ def wrap(self, source, outfile=None):
"""Return the source with a code, pre, and div."""
- return self._wrap_div(self._wrap_pre(self._wrap_code(source)))
+ if outfile is None:
+ # pygments >= 2.12
+ return self._add_newline(self._wrap_pre(self._wrap_code(source)))
+ else:
+ # pygments < 2.12
+ return self._wrap_div(self._add_newline(self._wrap_pre(self._wrap_code(source))))
formatter_opts.setdefault("cssclass", "codehilite")
formatter = HtmlCodeFormatter(**formatter_opts)
return pygments.highlight(codeblock, lexer, formatter)
- def _code_block_sub(self, match, is_fenced_code_block=False):
- lexer_name = None
- if is_fenced_code_block:
- lexer_name = match.group(1)
- if lexer_name:
- formatter_opts = self.extras['fenced-code-blocks'] or {}
- codeblock = match.group(2)
- codeblock = codeblock[:-1] # drop one trailing newline
- else:
- codeblock = match.group(1)
- codeblock = self._outdent(codeblock)
- codeblock = self._detab(codeblock)
- codeblock = codeblock.lstrip('\n') # trim leading newlines
- codeblock = codeblock.rstrip() # trim trailing whitespace
-
- # Note: "code-color" extra is DEPRECATED.
- if "code-color" in self.extras and codeblock.startswith(":::"):
- lexer_name, rest = codeblock.split('\n', 1)
- lexer_name = lexer_name[3:].strip()
- codeblock = rest.lstrip("\n") # Remove lexer declaration line.
- formatter_opts = self.extras['code-color'] or {}
-
- # Use pygments only if not using the highlightjs-lang extra
- if lexer_name and "highlightjs-lang" not in self.extras:
- def unhash_code(codeblock):
- for key, sanitized in list(self.html_spans.items()):
- codeblock = codeblock.replace(key, sanitized)
- replacements = [
- ("&", "&"),
- ("<", "<"),
- (">", ">")
- ]
- for old, new in replacements:
- codeblock = codeblock.replace(old, new)
- return codeblock
- lexer = self._get_pygments_lexer(lexer_name)
- if lexer:
- codeblock = unhash_code( codeblock )
- colored = self._color_with_pygments(codeblock, lexer,
- **formatter_opts)
- return "\n\n%s\n\n" % colored
+ def _code_block_sub(self, match: re.Match[str]) -> str:
+ codeblock = match.group(1)
+ codeblock = self._outdent(codeblock)
+ codeblock = self._detab(codeblock)
+ codeblock = codeblock.lstrip('\n') # trim leading newlines
+ codeblock = codeblock.rstrip() # trim trailing whitespace
- codeblock = self._encode_code(codeblock)
pre_class_str = self._html_class_str_from_tag("pre")
+ code_class_str = self._html_class_str_from_tag("code")
- if "highlightjs-lang" in self.extras and lexer_name:
- code_class_str = ' class="%s"' % lexer_name
- else:
- code_class_str = self._html_class_str_from_tag("code")
+ codeblock = self._encode_code(codeblock)
- return "\n\n
\n\n" % (
+ return "\n%s\n
\n".format(
pre_class_str, code_class_str, codeblock)
- def _html_class_str_from_tag(self, tag):
+ def _html_class_str_from_tag(self, tag: str) -> str:
"""Get the appropriate ' class="..."' string (note the leading
space), if any, for the given tag.
"""
@@ -1845,11 +1944,13 @@ def _html_class_str_from_tag(self, tag):
except TypeError:
return ""
else:
- if tag in html_classes_from_tag:
- return ' class="%s"' % html_classes_from_tag[tag]
+ if isinstance(html_classes_from_tag, dict):
+ if tag in html_classes_from_tag:
+ return ' class="%s"' % html_classes_from_tag[tag]
return ""
- def _do_code_blocks(self, text):
+ @mark_stage(Stage.CODE_BLOCKS)
+ def _do_code_blocks(self, text: str) -> str:
"""Process Markdown `{}\n` blocks."""
code_block_re = re.compile(r'''
(?:\n\n|\A\n?)
@@ -1862,25 +1963,11 @@ def _do_code_blocks(self, text):
((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
# Lookahead to make sure this block isn't already in a code block.
# Needed when syntax highlighting is being used.
- (?![^<]*\)
+ (?!([^<]|<(/?)span)*\%s" % c
+ return "{}".format(self._html_class_str_from_tag("code"), c)
- def _do_code_spans(self, text):
+ @mark_stage(Stage.CODE_SPANS)
+ def _do_code_spans(self, text: str) -> str:
# * Backtick quotes are used for spans.
#
# * You can use multiple backticks as the delimiters if you want to
@@ -1929,7 +2017,7 @@ def _do_code_spans(self, text):
# ... type `bar` ...
return self._code_span_re.sub(self._code_span_sub, text)
- def _encode_code(self, text):
+ def _encode_code(self, text: str) -> str:
"""Encode/escape certain characters inside Markdown code runs.
The point is that in code, these characters are literals,
and lose their special Markdown meanings.
@@ -1945,84 +2033,19 @@ def _encode_code(self, text):
for before, after in replacements:
text = text.replace(before, after)
hashed = _hash_text(text)
- self._escape_table[text] = hashed
+ self._code_table[text] = hashed
return hashed
- _strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S)
- def _do_strike(self, text):
- text = self._strike_re.sub(r"\1", text)
- return text
-
- _underline_re = re.compile(r"--(?=\S)(.+?)(?<=\S)--", re.S)
- def _do_underline(self, text):
- text = self._underline_re.sub(r"\1", text)
- return text
-
- _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
- _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
- _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
- _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
- def _do_italics_and_bold(self, text):
- # must go first:
- if "code-friendly" in self.extras:
- text = self._code_friendly_strong_re.sub(r"\1", text)
- text = self._code_friendly_em_re.sub(r"\1", text)
- else:
- text = self._strong_re.sub(r"\2", text)
- text = self._em_re.sub(r"\2", text)
- return text
-
- # "smarty-pants" extra: Very liberal in interpreting a single prime as an
- # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and
- # "twixt" can be written without an initial apostrophe. This is fine because
- # using scare quotes (single quotation marks) is rare.
- _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")
- _contractions = ["tis", "twas", "twer", "neath", "o", "n",
- "round", "bout", "twixt", "nuff", "fraid", "sup"]
- def _do_smart_contractions(self, text):
- text = self._apostrophe_year_re.sub(r"’\1", text)
- for c in self._contractions:
- text = text.replace("'%s" % c, "’%s" % c)
- text = text.replace("'%s" % c.capitalize(),
- "’%s" % c.capitalize())
- return text
-
- # Substitute double-quotes before single-quotes.
- _opening_single_quote_re = re.compile(r"(?
- See "test/tm-cases/smarty_pants.text" for a full discussion of the
- support here and
- .+?
)', re.S)
- def _dedent_two_spaces_sub(self, match):
+ def _dedent_two_spaces_sub(self, match: re.Match[str]) -> str:
return re.sub(r'(?m)^ ', '', match.group(1))
- def _block_quote_sub(self, match):
+ def _block_quote_sub(self, match: re.Match[str]) -> str:
bq = match.group(1)
is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq)
# trim one level of quoting
@@ -2064,7 +2087,8 @@ def _block_quote_sub(self, match):
else:
return '\n%s\n
\n\n' % bq
- def _do_block_quotes(self, text):
+ @mark_stage(Stage.BLOCK_QUOTES)
+ def _do_block_quotes(self, text: str) -> str:
if '>' not in text:
return text
if 'spoiler' in self.extras:
@@ -2072,7 +2096,8 @@ def _do_block_quotes(self, text):
else:
return self._block_quote_re.sub(self._block_quote_sub, text)
- def _form_paragraphs(self, text):
+ @mark_stage(Stage.PARAGRAPHS)
+ def _form_paragraphs(self, text: str) -> str:
# Strip leading and trailing lines:
text = text.strip('\n')
@@ -2102,8 +2127,13 @@ def _form_paragraphs(self, text):
):
start = li.start()
cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
- assert cuddled_list.startswith("") or cuddled_list.startswith("
")
- graf = graf[:start]
+ if re.match(r'^<(?:ul|ol).*?>', cuddled_list):
+ graf = graf[:start]
+ else:
+ # Not quite a cuddled list. (See not_quite_a_list_cuddled_lists test case)
+ # Store as a simple paragraph.
+ graf = cuddled_list
+ cuddled_list = None
# Wrap
` tag
+
+ Args:
+ url: the image URL/src
+ title_attr: a string containing the title attribute of the tag (eg: `' title="..."'`)
+ link_text: the human readable text portion of the link
+
+ Returns:
+ A tuple containing:
+
+ 1. The HTML string
+ 2. The length of the opening HTML tag in the string. For `
` it's the whole string.
+ This section will be skipped by the link processor
+ '''
+ img_class_str = self.md._html_class_str_from_tag("img")
+ result = (
+ f'
Tuple[str, int]:
+ '''
+ Takes a URL, title and link text and returns an HTML `` tag
+
+ Args:
+ url: the URL
+ title_attr: a string containing the title attribute of the tag (eg: `' title="..."'`)
+ link_text: the human readable text portion of the link
+
+ Returns:
+ A tuple containing:
+
+ 1. The HTML string
+ 2. The length of the opening HTML tag in the string. This section will be skipped
+ by the link processor
+ '''
+ if self.md.safe_mode and not self.md._safe_href.match(url):
+ result_head = f''
+ else:
+ result_head = f''
+
+ return f'{result_head}{link_text}', len(result_head)
+
+ def run(self, text: str):
+ MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24
+
+ # `anchor_allowed_pos` is used to support img links inside
+ # anchors, but not anchors inside anchors. An anchor's start
+ # pos must be `>= anchor_allowed_pos`.
+ anchor_allowed_pos = 0
+
+ curr_pos = 0
+
+ while True:
+ # The next '[' is the start of:
+ # - an inline anchor: [text](url "title")
+ # - a reference anchor: [text][id]
+ # - an inline img: 
+ # - a reference img: ![text][id]
+ # - a footnote ref: [^id]
+ # (Only if 'footnotes' extra enabled)
+ # - a footnote defn: [^id]: ...
+ # (Only if 'footnotes' extra enabled) These have already
+ # been stripped in _strip_footnote_definitions() so no
+ # need to watch for them.
+ # - a link definition: [id]: url "title"
+ # These have already been stripped in
+ # _strip_link_definitions() so no need to watch for them.
+ # - not markup: [...anything else...
+ try:
+ start_idx = text.index('[', curr_pos)
+ except ValueError:
+ break
+
+ # Find the matching closing ']'.
+ # Markdown.pl allows *matching* brackets in link text so we
+ # will here too. Markdown.pl *doesn't* currently allow
+ # matching brackets in img alt text -- we'll differ in that
+ # regard.
+ bracket_depth = 0
+
+ for p in range(
+ start_idx + 1,
+ min(start_idx + MAX_LINK_TEXT_SENTINEL, len(text))
+ ):
+ ch = text[p]
+ if ch == ']':
+ bracket_depth -= 1
+ if bracket_depth < 0:
+ break
+ elif ch == '[':
+ bracket_depth += 1
+ else:
+ # Closing bracket not found within sentinel length.
+ # This isn't markup.
+ curr_pos = start_idx + 1
+ continue
+ link_text = text[start_idx + 1: p]
+
+ # Fix for issue 341 - Injecting XSS into link text
+ if self.md.safe_mode:
+ link_text = self.md._hash_html_spans(link_text)
+ link_text = self.md._unhash_html_spans(link_text)
+
+ # Possibly a footnote ref?
+ if "footnotes" in self.md.extras and link_text.startswith("^"):
+ normed_id = re.sub(r'\W', '-', link_text[1:])
+ if normed_id in self.md.footnotes:
+ result = (
+ f''
+ # insert special footnote marker that's easy to find and match against later
+ f'{self.md._footnote_marker}-{normed_id}'
+ )
+ text = text[:start_idx] + result + text[p+1:]
+ else:
+ # This id isn't defined, leave the markup alone.
+ curr_pos = p + 1
+ continue
+
+ # Now determine what this is by the remainder.
+ p += 1
+
+ # -- Extract the URL, title and end index from the link
+
+ # inline anchor or inline img
+ if text[p:p + 1] == '(':
+ if not self.options.get('inline', True):
+ curr_pos = start_idx + 1
+ continue
+
+ parsed = self.parse_inline_anchor_or_image(text, link_text, p)
+ if not parsed:
+ # text isn't markup
+ curr_pos = start_idx + 1
+ continue
+
+ text, url, title, url_end_idx = parsed
+ url = self.md._unhash_html_spans(url, code=True)
+ # reference anchor or reference img
+ else:
+ if not self.options.get('ref', True):
+ curr_pos = start_idx + 1
+ continue
+
+ parsed = self.parse_ref_anchor_or_ref_image(text, link_text, p)
+ if not parsed:
+ curr_pos = start_idx + 1
+ continue
+
+ text, url, title, url_end_idx = parsed
+ if url is None:
+ # This id isn't defined, leave the markup alone.
+ # set current pos to end of link title and continue from there
+ curr_pos = p
+ continue
+
+ # -- Encode and hash the URL and title to avoid conflicts with italics/bold
+
+ url = (
+ url
+ .replace('*', self.md._escape_table['*'])
+ .replace('_', self.md._escape_table['_'])
+ )
+ if title:
+ if self.md.safe_mode:
+ # expose span contents for escaping - fix #691, #703
+ title = self.md._unhash_html_spans(title, spans=True, code=True)
+ title = (
+ _xml_escape_attr(title)
+ .replace('*', self.md._escape_table['*'])
+ .replace('_', self.md._escape_table['_'])
+ )
+ title_str = f' title="{title}"'
+ else:
+ title_str = ''
+
+ # -- Process the anchor/image
+
+ is_img = start_idx > 0 and text[start_idx-1] == "!"
+ if is_img:
+ if 'img' not in self.options.get('tags', ['img']):
+ curr_pos = start_idx + 1
+ continue
+
+ if link_text and self.md.safe_mode:
+ # expose span contents for escaping - fix #699, #703
+ link_text = self.md._unhash_html_spans(link_text, spans=True, code=True)
+
+ start_idx -= 1
+ result, skip = self.process_image(url, title_str, link_text)
+ elif start_idx >= anchor_allowed_pos:
+ if 'a' not in self.options.get('tags', ['a']):
+ curr_pos = start_idx + 1
+ continue
+
+ result, skip = self.process_anchor(url, title_str, link_text)
+ else:
+ # anchor not allowed here/invalid markup
+ curr_pos = start_idx + 1
+ continue
+
+ if "smarty-pants" in self.md.extras:
+ result = result.replace('"', self.md._escape_table['"'])
+
+ #
allowed from curr_pos onwards, allowed from anchor_allowed_pos onwards.
+ # this means images can exist within `` tags but anchors can only come after the
+ # current anchor has been closed
+ curr_pos = start_idx + skip
+ anchor_allowed_pos = start_idx + len(result)
+ text = text[:start_idx] + result + text[url_end_idx:]
+
+ return text
+
+ def test(self, text: str):
+ return '(' in text or '[' in text
+
+
+# User facing extras
+# ----------------------------------------------------------
+
+
+class Admonitions(Extra):
+ '''
+ Enable parsing of RST admonitions
+ '''
+
+ name = 'admonitions'
+ order = (Stage.BLOCK_GAMUT, Stage.LINK_DEFS), ()
+
+ admonitions = r'admonition|attention|caution|danger|error|hint|important|note|tip|warning'
+
+ admonitions_re = re.compile(r'''
+ ^(\ *)\.\.\ (%s)::\ * # $1 leading indent, $2 the admonition
+ (.*)? # $3 admonition title
+ ((?:\s*\n\1\ {3,}.*)+?) # $4 admonition body (required)
+ (?=\s*(?:\Z|\n{4,}|\n\1?\ {0,2}\S)) # until EOF, 3 blank lines or something less indented
+ ''' % admonitions,
+ re.IGNORECASE | re.MULTILINE | re.VERBOSE
+ )
+
+ def test(self, text: str):
+ return self.admonitions_re.search(text) is not None
+
+ def sub(self, match: re.Match[str]) -> str:
+ lead_indent, admonition_name, title, body = match.groups()
+
+ admonition_type = '%s' % admonition_name
+
+ # figure out the class names to assign the block
+ if admonition_name.lower() == 'admonition':
+ admonition_class = 'admonition'
+ else:
+ admonition_class = 'admonition %s' % admonition_name.lower()
+
+ # titles are generally optional
+ if title:
+ title = '%s' % title
+
+ # process the admonition body like regular markdown
+ body = self.md._run_block_gamut("\n%s\n" % self.md._uniform_outdent(body)[1])
+
+ # indent the body before placing inside the aside block
+ admonition = self.md._uniform_indent(
+ '{}\n{}\n\n{}\n'.format(admonition_type, title, body),
+ self.md.tab, False
+ )
+ # wrap it in an aside
+ admonition = ''.format(admonition_class, admonition)
+ # now indent the whole admonition back to where it started
+ return self.md._uniform_indent(admonition, lead_indent, False)
+
+ def run(self, text: str):
+ return self.admonitions_re.sub(self.sub, text)
+
+
+class Alerts(Extra):
+ '''
+ Markdown Alerts as per
+ https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#alerts
+ '''
+
+ name = 'alerts'
+ order = (), (Stage.BLOCK_QUOTES, )
+
+ alert_re = re.compile(r'''
+
\s*
+
+ ''', re.X
+ )
+
+ def test(self, text: str):
+ return "" in text
+
+ def sub(self, match: re.Match[str]) -> str:
+ typ = match["type"].lower()
+ heading = f"{match['type'].title()}"
+ contents = match["contents"].strip()
+ if match["closing_tag"]:
+ return f'
'''
+ on_newline: bool
+ '''Replace single new line characters with
when True'''
+
+
+class Breaks(Extra):
+ name = 'breaks'
+ order = (), (Stage.ITALIC_AND_BOLD,)
+ options: _BreaksExtraOpts
+
+ def run(self, text: str):
+ on_backslash = self.options.get('on_backslash', False)
+ on_newline = self.options.get('on_newline', False)
+
+ if on_backslash and on_newline:
+ pattern = r' *\\?'
+ elif on_backslash:
+ pattern = r'(?: *\\| {2,})'
+ elif on_newline:
+ pattern = r' *'
+ else:
+ pattern = r' {2,}'
+
+ break_tag = "
)", break_tag, text)
+
+ return text
+
+
+class CodeFriendly(GFMItalicAndBoldProcessor):
+ '''
+ Disable _ and __ for em and strong.
+ '''
+ name = 'code-friendly'
+ order = (Stage.ITALIC_AND_BOLD,), (Stage.ITALIC_AND_BOLD,)
+
+ def __init__(self, md, options):
+ super().__init__(md, options)
+ self.hash_table = {}
+
+ def run(self, text: str):
+ if self.md.order < Stage.ITALIC_AND_BOLD:
+ text = super().run(text)
+ else:
+ orig_text = ''
+ while orig_text != text:
+ orig_text = text
+ for key, substr in self.hash_table.items():
+ text = text.replace(key, substr)
+ return text
+
+ def process_span(
+ self, open: re.Match[str], close: re.Match[str], middle: Optional[re.Match[str]] = None,
+ open_syntax: Optional[str] = None, close_syntax: Optional[str] = None
+ ):
+ text = open.string[open.start(): close.end()]
+ open_syntax = open_syntax or cast(str, open.group(1))
+ close_syntax = close_syntax or cast(str, close.group(1))
+
+ if '_' in open_syntax:
+ # if using _this_ syntax, hash it to avoid processing, but don't hash the contents incase of nested syntax
+ text = re.sub(r'_+', lambda m: self._hash_text(m.group(0)), text)
+ return [text], None
+ elif '_' in text:
+ # if the text within the bold/em markers contains '_' then hash those chars to protect them from em_re
+ text = (
+ text[len(open_syntax): -len(close_syntax)]
+ .replace('__', self._hash_text('__'))
+ .replace('_', self._hash_text('_'))
+ )
+ return [open_syntax, text, close_syntax], None
+
+ return super().process_span(open, close, middle, open_syntax, close_syntax)
+
+ def _hash_text(self, text: str):
+ '''
+ Wrapper around `_hash_text` that updates the entries in `self.hash_table`
+ '''
+ # perf: avoid calling _hash_text if we can. Lookup in existing hash table
+ if text in self.hash_table.values():
+ return tuple(self.hash_table.keys())[tuple(self.hash_table.values()).index(text)]
+
+ # add a prefix to it so we don't interfere with escaped/hashed chars from other stages
+ hashed = _hash_text(self.name + text)
+ self.hash_table[hashed] = text
+ return hashed
+
+ def test(self, text: str):
+ return super().test(text) or (
+ self.hash_table and re.search(r'md5-[0-9a-z]{32}', text)
+ )
+
+
+class FencedCodeBlocks(Extra):
+ '''
+ Allows a code block to not have to be indented
+ by fencing it with '```' on a line before and after. Based on
+
')
+
+ def sub(self, match: re.Match[str]) -> str:
+ lexer_name = match.group(2)
+ codeblock = match.group(3)
+ codeblock = codeblock[:-1] # drop one trailing newline
+
+ # figure out what newlines were already surrounding the code block and preserve them in the output
+ leading_newlines = match.string[match.start(): match.regs[1][0]]
+ trailing_newlines = re.search(r'\n*$', match.group()).group()
+
+ # Use pygments only if not using the highlightjs-lang extra
+ if lexer_name and "highlightjs-lang" not in self.md.extras:
+ lexer = self.md._get_pygments_lexer(lexer_name)
+ if lexer:
+ leading_indent = ' ' * (len(match.group(1)) - len(match.group(1).lstrip()))
+ return (
+ leading_newlines
+ + self._code_block_with_lexer_sub(codeblock, leading_indent, lexer)
+ + trailing_newlines
+ )
+
+ # Fenced code blocks need to be outdented before encoding, and then reapplied
+ leading_indent = ' ' * (len(match.group(1)) - len(match.group(1).lstrip()))
+ if codeblock:
+ # only run the codeblock through the outdenter if not empty
+ leading_indent, codeblock = self.md._uniform_outdent(codeblock, max_outdent=leading_indent)
+
+ codeblock = self.md._encode_code(codeblock)
+
+ tags = self.tags(lexer_name)
+
+ return (
+ f'{leading_newlines}'
+ f'{leading_indent}{tags[0]}'
+ f'{codeblock}'
+ f'\n{leading_indent}{tags[1]}'
+ f'{trailing_newlines}'
+ )
+
+ def run(self, text: str):
+ return self.fenced_code_block_re.sub(self.sub, text)
+
+
+class Latex(Extra):
+ '''
+ Convert $ and $$ to tags for inline and block math.
+ '''
+ name = 'latex'
+ order = (Stage.CODE_BLOCKS, FencedCodeBlocks), ()
+
+ _single_dollar_re = re.compile(r'(?(.*?)", re.DOTALL) # Wraped in '.format(pre_class, code_class), '
+ _triple_re = re.compile(r'```(.*?)```', re.DOTALL) # Wrapped in a code block ```
+ _single_re = re.compile(r'(?"
+ self.code_blocks[placeholder] = match.group(0)
+ return placeholder
+
+ def run(self, text: str):
+ try:
+ import latex2mathml.converter
+ self.converter = latex2mathml.converter
+ except ImportError:
+ raise ImportError('The "latex" extra requires the "latex2mathml" package to be installed.')
+
+ # Escape by replacing with a code block
+ text = self._pre_code_block_re.sub(self.code_placeholder, text)
+ text = self._single_re.sub(self.code_placeholder, text)
+ text = self._triple_re.sub(self.code_placeholder, text)
+
+ text = self._single_dollar_re.sub(self._convert_single_match, text)
+ text = self._double_dollar_re.sub(self._convert_double_match, text)
+
+ # Convert placeholder tag back to original code
+ for placeholder, code_block in self.code_blocks.items():
+ text = text.replace(placeholder, code_block)
+
+ return text
+
+
+class LinkPatterns(Extra):
+ '''
+ Auto-link given regex patterns in text (e.g. bug number
+ references, revision number references).
+ '''
+ name = 'link-patterns'
+ order = (Stage.LINKS,), ()
+ options: _link_patterns
+
+ _basic_link_re = re.compile(r'!?\[.*?\]\(.*?\)')
+
+ def run(self, text: str):
+ link_from_hash = {}
+ for regex, repl in self.options:
+ replacements = []
+ for match in regex.finditer(text):
+ if any(self.md._match_overlaps_substr(text, match, h) for h in link_from_hash):
+ continue
+
+ if callable(repl):
+ href = repl(match)
+ else:
+ href = match.expand(repl)
+ replacements.append((match.span(), href))
+ for (start, end), href in reversed(replacements):
+
+ # Do not match against links inside brackets.
+ if text[start - 1:start] == '[' and text[end:end + 1] == ']':
+ continue
+
+ # Do not match against links in the standard markdown syntax.
+ if text[start - 2:start] == '](' or text[end:end + 2] == '")':
+ continue
+
+ # Do not match against links which are escaped.
+ if text[start - 3:start] == '"""' and text[end:end + 3] == '"""':
+ text = text[:start - 3] + text[start:end] + text[end + 3:]
+ continue
+
+ # search the text for anything that looks like a link
+ is_inside_link = False
+ for link_re in (self.md._auto_link_re, self._basic_link_re):
+ for match in link_re.finditer(text):
+ if any((r[0] <= start and end <= r[1]) for r in match.regs):
+ # if the link pattern start and end pos is within the bounds of
+ # something that looks like a link, then don't process it
+ is_inside_link = True
+ break
+ else:
+ continue
+ break
+
+ if is_inside_link:
+ continue
+
+ escaped_href = (
+ href.replace('"', '"') # b/c of attr quote
+ # To avoid markdown and :
+ .replace('*', self.md._escape_table['*'])
+ .replace('_', self.md._escape_table['_']))
+ link = '{}'.format(escaped_href, text[start:end])
+ hash = self.md._hash_span(link, link_from_hash)
+ text = text[:start] + hash + text[end:]
+ for hash, link in list(link_from_hash.items()):
+ text = text.replace(hash, link)
+ return text
+
+ def test(self, text: str):
+ return True
+
+
+class MarkdownInHTML(Extra):
+ '''
+ Allow the use of `markdown="1"` in a block HTML tag to
+ have markdown processing be done on its contents. Similar to
+
')
+ return super().tags(lexer_name)
+
+
+class MiddleWordEm(GFMItalicAndBoldProcessor):
+ '''
+ Allows or disallows emphasis syntax in the middle of words,
+ defaulting to allow. Disabling this means that `this_text_here` will not be
+ converted to `thistexthere`.
+ '''
+ name = 'middle-word-em'
+ order = (CodeFriendly,), (Stage.ITALIC_AND_BOLD,)
+
+ def __init__(self, md: Markdown, options: Union[dict, bool, None]):
+ '''
+ Args:
+ md: the markdown instance
+ options: can be bool for backwards compatibility but will be converted to a dict
+ in the constructor. All options are:
+ - allowed (bool): whether to allow emphasis in the middle of a word.
+ If `options` is a bool it will be placed under this key.
+ '''
+ if isinstance(options, bool):
+ options = {'allowed': options}
+ else:
+ options = options or {}
+ options.setdefault('allowed', True)
+ super().__init__(md, options)
+
+ escaped_hashes = '|'.join(md._escape_table.values())
+
+ self.middle_word_em_re = re.compile(
+ r'''
+ (? self.md.stage:
+ text = text.replace(self.hash_table['_'], '_')
+ text = text.replace(self.hash_table['*'], '*')
+
+ return text
+
+ def sub(self, match: re.Match[str]):
+ if match.re != self.middle_word_em_re:
+ return super().sub(match)
+
+ syntax = match.group(1)
+ return self.hash_table[syntax]
+
+ def test(self, text: str):
+ return super().test(text) or (
+ self.hash_table and re.search(r'md5-[0-9a-z]{32}', text)
+ )
+
+
+class Numbering(Extra):
+ '''
+ Support of generic counters. Non standard extension to
+ allow sequential numbering of figures, tables, equations, exhibits etc.
+ '''
+
+ name = 'numbering'
+ order = (Stage.LINK_DEFS,), ()
+
+ def run(self, text: str):
+ # First pass to define all the references
+ regex_defns = re.compile(r'''
+ \[\#(\w+) # the counter. Open square plus hash plus a word \1
+ ([^@]*) # Some optional characters, that aren't an @. \2
+ @(\w+) # the id. Should this be normed? \3
+ ([^\]]*)\] # The rest of the text up to the terminating ] \4
+ ''', re.VERBOSE)
+ regex_subs = re.compile(r"\[@(\w+)\s*\]") # [@ref_id]
+ counters = {}
+ references = {}
+ replacements = []
+ definition_html = '
+ blocks.
+ '''
+
+ name = 'pyshell'
+ order = (), (Stage.LISTS,)
+
+ def test(self, text: str):
+ return ">>>" in text
+
+ def sub(self, match: re.Match[str]) -> str:
+ if "fenced-code-blocks" in self.md.extras:
+ dedented = _dedent(match.group(0))
+ return self.md.extra_classes['fenced-code-blocks'].run("```pycon\n" + dedented + "```\n")
+
+ lines = match.group(0).splitlines(0)
+ _dedentlines(lines)
+ indent = ' ' * self.md.tab_width
+ s = ('\n' # separate from possible cuddled paragraph
+ + indent + ('\n'+indent).join(lines)
+ + '\n')
+ return s
+
+ def run(self, text: str):
+ less_than_tab = self.md.tab_width - 1
+ _pyshell_block_re = re.compile(r"""
+ ^([ ]{0,%d})>>>[ ].*\n # first line
+ ^(\1[^\S\n]*\S.*\n)* # any number of subsequent lines with at least one character
+ (?=^\1?\n|\Z) # ends with a blank line or end of document
+ """ % less_than_tab, re.M | re.X)
+
+ return _pyshell_block_re.sub(self.sub, text)
+
+
+class SmartyPants(Extra):
+ '''
+ Replaces ' and " with curly quotation marks or curly
+ apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
+ and ellipses.
+ '''
+ name = 'smarty-pants'
+ order = (), (Stage.SPAN_GAMUT,)
+
+ _opening_single_quote_re = re.compile(r"(? str:
+ text = self._apostrophe_year_re.sub(r"’\1", text)
+ for c in self._contractions:
+ text = text.replace("'%s" % c, "’%s" % c)
+ text = text.replace("'%s" % c.capitalize(),
+ "’%s" % c.capitalize())
+ return text
+
+ def run(self, text: str):
+ """Fancifies 'single quotes', "double quotes", and apostrophes.
+ Converts --, ---, and ... into en dashes, em dashes, and ellipses.
+
+ Inspiration is: \1", text)
+
+ def test(self, text: str):
+ return '~~' in text
+
+
+class Tables(Extra):
+ '''
+ Tables using the same format as GFM
+ ']
+ cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)))]
+ for col_idx, col in enumerate(cols):
+ hlines.append(' ')
+ hlines.append('')
+
+ # tbody
+ body = body.strip('\n')
+ if body:
+ hlines.append('')
+ for line in body.split('\n'):
+ hlines.append('{} '.format(
+ align_from_col_idx.get(col_idx, ''),
+ self.md._run_span_gamut(col)
+ ))
+ hlines.append('')
+ cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)))]
+ for col_idx, col in enumerate(cols):
+ hlines.append(' ')
+ hlines.append('')
+ hlines.append('')
+
+ return '\n'.join(hlines) + '\n'
+
+
+class TelegramSpoiler(Extra):
+ name = 'tg-spoiler'
+ order = (), (Stage.ITALIC_AND_BOLD,)
+
+ _tg_spoiler_re = re.compile(r"\|\|\s?(.+?)\s?\|\|", re.S)
+
+ def run(self, text: str):
+ return self._tg_spoiler_re.sub(r"{} '.format(
+ align_from_col_idx.get(col_idx, ''),
+ self.md._run_span_gamut(col)
+ ))
+ hlines.append('', 2)
+ for cell in rows[0]:
+ add_hline(f" ', 2)
+ add_hline('', 1)
+ # Only one header row allowed.
+ rows = rows[1:]
+ # If no more rows, don't create a tbody.
+ if rows:
+ add_hline('', 1)
+ for row in rows:
+ add_hline('{format_cell(cell)} ", 3)
+ add_hline('', 2)
+ for cell in row:
+ add_hline(f' ', 2)
+ add_hline('', 1)
+ add_hline('')
+ return '\n'.join(hlines) + '\n'
+
+ def test(self, text):
+ return '||' in text
+
+
+# Register extras
+Admonitions.register()
+Alerts.register()
+Breaks.register()
+CodeFriendly.register()
+FencedCodeBlocks.register()
+Latex.register()
+LinkPatterns.register()
+MarkdownInHTML.register()
+MarkdownFileLinks.register()
+MiddleWordEm.register()
+Mermaid.register()
+Numbering.register()
+PyShell.register()
+SmartyPants.register()
+Strike.register()
+Tables.register()
+TelegramSpoiler.register()
+Underline.register()
+Wavedrom.register()
+WikiTables.register()
+
+
+# ----------------------------------------------------------
+
+
+# ---- internal support functions
+
+
+def calculate_toc_html(toc: Union[list[tuple[int, str, str]], None]) -> Optional[str]:
+ """Return the HTML for the current TOC.
+
+ This expects the `_toc` attribute to have been set on this instance.
+ """
+ if toc is None:
+ return None
def indent():
return ' ' * (len(h_stack) - 1)
@@ -2326,7 +4372,7 @@ def indent():
if not lines[-1].endswith("{format_cell(cell)} ', 3)
+ add_hline('
EOL
@@ -107,12 +101,12 @@ def _markdown_from_aspn_html(html):
title = None
escaped_href = href.replace('(', '\\(').replace(')', '\\)')
if title is None:
- replacement = '[%s](%s)' % (content, escaped_href)
+ replacement = '[{}]({})'.format(content, escaped_href)
else:
- replacement = '[%s](%s "%s")' % (content, escaped_href,
+ replacement = '[{}]({} "{}")'.format(content, escaped_href,
title.replace('"', "'"))
markdown = markdown[:start] + replacement + markdown[end:]
-
+
markdown = markdown.replace(" ", ' ')
# part 1: Pull out
-blocks and put in placeholders
@@ -137,7 +131,7 @@ def _markdown_from_aspn_html(html):
try:
idx = markdown.index(marker)
except ValueError:
- print "marker: %r" % marker
+ print("marker: %r" % marker)
raise
if not markdown[:idx].strip():
#TODO: Correct this false diagnosis. Problem is not limited
@@ -182,20 +176,20 @@ def _markdown_from_aspn_html(html):
# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
"""_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
-
+
"lines" is a list of lines to dedent.
"tabsize" is the tab width to use for indent width calculations.
"skip_first_line" is a boolean indicating if the first line should
be skipped for calculating the indent width and for dedenting.
This is sometimes useful for docstrings and similar.
-
+
Same as dedent() except operates on a sequence of lines. Note: the
lines list is modified **in-place**.
"""
DEBUG = False
- if DEBUG:
- print "dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
- % (tabsize, skip_first_line)
+ if DEBUG:
+ print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
+ % (tabsize, skip_first_line))
indents = []
margin = None
for i, line in enumerate(lines):
@@ -212,12 +206,12 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
break
else:
continue # skip all-whitespace lines
- if DEBUG: print "dedent: indent=%d: %r" % (indent, line)
+ if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
if margin is None:
margin = indent
else:
margin = min(margin, indent)
- if DEBUG: print "dedent: margin=%r" % margin
+ if DEBUG: print("dedent: margin=%r" % margin)
if margin is not None and margin > 0:
for i, line in enumerate(lines):
@@ -229,7 +223,7 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
elif ch == '\t':
removed += tabsize - (removed % tabsize)
elif ch in '\r\n':
- if DEBUG: print "dedent: %r: EOL -> strip up to EOL" % line
+ if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)
lines[i] = lines[i][j:]
break
else:
@@ -237,8 +231,8 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
"line %r while removing %d-space margin"
% (ch, line, margin))
if DEBUG:
- print "dedent: %r: %r -> removed %d/%d"\
- % (line, ch, removed, margin)
+ print("dedent: %r: %r -> removed %d/%d"\
+ % (line, ch, removed, margin))
if removed == margin:
lines[i] = lines[i][j+1:]
break
@@ -258,7 +252,7 @@ def _dedent(text, tabsize=8, skip_first_line=False):
"skip_first_line" is a boolean indicating if the first line should
be skipped for calculating the indent width and for dedenting.
This is sometimes useful for docstrings and similar.
-
+
textwrap.dedent(s), but don't expand tabs to spaces
"""
lines = text.splitlines(1)
diff --git a/perf/perf.py b/perf/perf.py
index ea6f10ea..ed8bd864 100755
--- a/perf/perf.py
+++ b/perf/perf.py
@@ -10,7 +10,6 @@
import os
import sys
-import timeit
import time
from os.path import *
from glob import glob
@@ -35,7 +34,7 @@ def time_markdown_py(cases_dir, repeat):
for i in range(repeat):
start = clock()
for path in glob(join(cases_dir, "*.text")):
- f = open(path, 'r')
+ f = open(path)
content = f.read()
f.close()
try:
@@ -45,7 +44,7 @@ def time_markdown_py(cases_dir, repeat):
pass
end = clock()
times.append(end - start)
- print " markdown.py: best of %d: %.3fs" % (repeat, min(times))
+ print(" markdown.py: best of %d: %.3fs" % (repeat, min(times)))
@hotshotit
def hotshot_markdown2_py(cases_dir, repeat):
@@ -60,13 +59,13 @@ def time_markdown2_py(cases_dir, repeat):
for i in range(repeat):
start = clock()
for path in glob(join(cases_dir, "*.text")):
- f = open(path, 'r')
+ f = open(path)
content = f.read()
f.close()
markdowner.convert(content)
end = clock()
times.append(end - start)
- print " markdown2.py: best of %d: %.3fs" % (repeat, min(times))
+ print(" markdown2.py: best of %d: %.3fs" % (repeat, min(times)))
def time_markdown_pl(cases_dir, repeat):
times = []
@@ -75,7 +74,7 @@ def time_markdown_pl(cases_dir, repeat):
os.system('perl time_markdown_pl.pl "%s"' % cases_dir)
end = clock()
times.append(end - start)
- print " Markdown.pl: best of %d: %.3fs" % (repeat, min(times))
+ print(" Markdown.pl: best of %d: %.3fs" % (repeat, min(times)))
def time_all(cases_dir, repeat):
time_markdown_pl(cases_dir, repeat=repeat)
@@ -131,10 +130,10 @@ def main(args=sys.argv):
if timer_name not in d:
raise ValueError("no '%s' timer function" % timer_name)
timer = d[timer_name]
- print "Profile conversion of %s (plat=%s):" \
- % (os.path.join(cases_dir, "*.text"), sys.platform)
+ print("Profile conversion of %s (plat=%s):" \
+ % (os.path.join(cases_dir, "*.text"), sys.platform))
timer(cases_dir, repeat=opts.repeat)
- print
+ print()
os.system("python show_stats.py %s.prof" % timer_name)
else:
@@ -145,8 +144,8 @@ def main(args=sys.argv):
if timer_name not in d:
raise ValueError("no '%s' timer function" % timer_name)
timer = d[timer_name]
- print "Time conversion of %s (plat=%s):" \
- % (os.path.join(cases_dir, "*.text"), sys.platform)
+ print("Time conversion of %s (plat=%s):" \
+ % (os.path.join(cases_dir, "*.text"), sys.platform))
timer(cases_dir, repeat=opts.repeat)
if __name__ == "__main__":
diff --git a/perf/strip_cookbook_data.py b/perf/strip_cookbook_data.py
index 0f4abed0..94ba3048 100644
--- a/perf/strip_cookbook_data.py
+++ b/perf/strip_cookbook_data.py
@@ -1,11 +1,10 @@
-
from os.path import *
-from pprint import pprint, pformat
-import datetime
+from pprint import pformat
def doit():
recipes_path = expanduser("recipes.pprint")
- recipe_dicts = eval(open(recipes_path).read())
+ with open(recipes_path) as f:
+ recipe_dicts = eval(f.read())
for r in recipe_dicts:
for key in r.keys():
if key not in ('desc', 'comments'):
@@ -14,11 +13,10 @@ def doit():
for key in c.keys():
if key not in ('comment', 'title'):
del c[key]
-
+
f = open("stripped.pprint", 'w')
f.write(pformat(recipe_dicts))
f.close()
doit()
-
diff --git a/perf/util.py b/perf/util.py
index 05a22a0c..7fcc862f 100644
--- a/perf/util.py
+++ b/perf/util.py
@@ -3,15 +3,7 @@
"""Perf utility functions"""
-import os
-from os.path import basename
import sys
-import md5
-import re
-import stat
-import textwrap
-import types
-from pprint import pprint, pformat
# Global dict for holding specific hotshot profilers
@@ -38,14 +30,14 @@ def wrapper(*args, **kw):
return func(*args, **kw)
finally:
total_time = clock() - start_time
- print "%s took %.3fs" % (func.func_name, total_time)
+ print("{} took {:.3f}s".format(func.__name__, total_time))
return wrapper
def hotshotit(func):
def wrapper(*args, **kw):
import hotshot
global hotshotProfilers
- prof_name = func.func_name+".prof"
+ prof_name = func.__name__+".prof"
profiler = hotshotProfilers.get(prof_name)
if profiler is None:
profiler = hotshot.Profile(prof_name)
diff --git a/sandbox/wiki.py b/sandbox/wiki.py
index ac27199a..7da4a8bc 100644
--- a/sandbox/wiki.py
+++ b/sandbox/wiki.py
@@ -1,9 +1,8 @@
-
import sys
import re
-from os.path import *
+from os.path import dirname, abspath
-sys.path.insert(0, dirname(dirname(abspath(__file__))))
+sys.path.insert(0, dirname(dirname(abspath(__file__))) + '/lib')
import markdown2
wiki_page = """
@@ -18,4 +17,4 @@
]
processor = markdown2.Markdown(extras=["link-patterns"],
link_patterns=link_patterns)
-print processor.convert(wiki_page)
+print(processor.convert(wiki_page))
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 5e409001..00000000
--- a/setup.cfg
+++ /dev/null
@@ -1,2 +0,0 @@
-[wheel]
-universal = 1
diff --git a/setup.py b/setup.py
old mode 100644
new mode 100755
index 1d374829..05dba326
--- a/setup.py
+++ b/setup.py
@@ -15,17 +15,13 @@
classifiers = """\
Development Status :: 5 - Production/Stable
Intended Audience :: Developers
-License :: OSI Approved :: MIT License
Programming Language :: Python
-Programming Language :: Python :: 2
-Programming Language :: Python :: 2.6
-Programming Language :: Python :: 2.7
Programming Language :: Python :: 3
-Programming Language :: Python :: 3.3
-Programming Language :: Python :: 3.4
-Programming Language :: Python :: 3.5
-Programming Language :: Python :: 3.6
-Programming Language :: Python :: 3.7
+Programming Language :: Python :: 3.9
+Programming Language :: Python :: 3.10
+Programming Language :: Python :: 3.11
+Programming Language :: Python :: 3.12
+Programming Language :: Python :: 3.13
Operating System :: OS Independent
Topic :: Software Development :: Libraries :: Python Modules
Topic :: Software Development :: Documentation
@@ -33,7 +29,13 @@
Topic :: Text Processing :: Markup :: HTML
"""
-script = (sys.platform == "win32" and "lib\\markdown2.py" or "bin/markdown2")
+extras_require = {
+ "code_syntax_highlighting": ["pygments>=2.7.3"],
+ "wavedrom": ["wavedrom"],
+ "latex": ['latex2mathml; python_version>="3.8.1"'],
+}
+# nested listcomp to combine all optional extras into convenient "all" option
+extras_require["all"] = [i for v in tuple(extras_require.values()) for i in v]
setup(
name="markdown2",
@@ -47,9 +49,15 @@
platforms=["any"],
py_modules=["markdown2"],
package_dir={"": "lib"},
- scripts=[script],
+ entry_points={
+ "console_scripts": [
+ "markdown2 = markdown2:main"
+ ]
+ },
description="A fast and complete Python implementation of Markdown",
- classifiers=filter(None, classifiers.split("\n")),
+ python_requires=">=3.9, <4",
+ extras_require=extras_require,
+ classifiers=classifiers.strip().split("\n"),
long_description="""markdown2: A fast and complete Python implementation of Markdown.
Markdown is a text-to-HTML filter; it translates an easy-to-read /
diff --git a/test/markdown.py b/test/markdown.py
index 07334a81..0e99f6d7 100644
--- a/test/markdown.py
+++ b/test/markdown.py
@@ -29,16 +29,17 @@
"""
-import re, sys, os, random, codecs
+from pathlib import Path
+import re, sys
# Set debug level: 3 none, 2 critical, 1 informative, 0 all
-(VERBOSE, INFO, CRITICAL, NONE) = range(4)
+(VERBOSE, INFO, CRITICAL, NONE) = list(range(4))
MESSAGE_THRESHOLD = CRITICAL
def message(level, text) :
if level >= MESSAGE_THRESHOLD :
- print text
+ print(text)
# --------------- CONSTANTS YOU MIGHT WANT TO MODIFY -----------------
@@ -48,9 +49,9 @@ def message(level, text) :
SMART_EMPHASIS = 1 # this_or_that does not become thisorthat
HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
-RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
+RTL_BIDI_RANGES = ( ('\u0590', '\u07FF'),
# from Hebrew to Nko (includes Arabic, Syriac and Thaana)
- (u'\u2D30', u'\u2D7F'),
+ ('\u2D30', '\u2D7F'),
# Tifinagh
)
@@ -62,9 +63,9 @@ def message(level, text) :
# 0780-07BF - Thaana
# 07C0-07FF - Nko
-BOMS = { 'utf-8' : (unicode(codecs.BOM_UTF8, "utf-8"), ),
- 'utf-16' : (unicode(codecs.BOM_UTF16_LE, "utf-16"),
- unicode(codecs.BOM_UTF16_BE, "utf-16")),
+BOMS = { 'utf-8' : (str(codecs.BOM_UTF8, "utf-8"), ),
+ 'utf-16' : (str(codecs.BOM_UTF16_LE, "utf-16"),
+ str(codecs.BOM_UTF16_BE, "utf-16")),
#'utf-32' : (unicode(codecs.BOM_UTF32_LE, "utf-32"),
# unicode(codecs.BOM_UTF32_BE, "utf-32")),
}
@@ -81,7 +82,7 @@ def removeBOM(text, encoding):
# and uses the actual name of the executable called.)
EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
-
+
# --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ----------
@@ -116,7 +117,7 @@ def is_block_level (tag) :
(re.compile(">"), ">"),
(re.compile("\""), """)]
-ENTITY_NORMALIZATION_EXPRESSIONS_SOFT = [ (re.compile("&(?!\#)"), "&"),
+ENTITY_NORMALIZATION_EXPRESSIONS_SOFT = [ (re.compile(r"&(?!\#)"), "&"),
(re.compile("<"), "<"),
(re.compile(">"), ">"),
(re.compile("\""), """)]
@@ -128,7 +129,7 @@ def getBidiType(text) :
ch = text[0]
- if not isinstance(ch, unicode) or not ch.isalpha():
+ if not isinstance(ch, str) or not ch.isalpha():
return None
else :
@@ -306,26 +307,26 @@ def toxml(self):
childBuffer += "/>"
-
+
buffer += "<" + self.nodeName
if self.nodeName in ['p', 'li', 'ul', 'ol',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] :
- if not self.attribute_values.has_key("dir"):
+ if "dir" not in self.attribute_values:
if self.bidi :
bidi = self.bidi
else :
bidi = self.doc.bidi
-
+
if bidi=="rtl" :
self.setAttribute("dir", "rtl")
-
+
for attr in self.attributes :
value = self.attribute_values[attr]
value = self.doc.normalizeEntities(value,
avoidDoubleNormalizing=True)
- buffer += ' %s="%s"' % (attr, value)
+ buffer += ' {}="{}"'.format(attr, value)
# Now let's actually append the children
@@ -345,7 +346,7 @@ class TextNode :
attrRegExp = re.compile(r'\{@([^\}]*)=([^\}]*)}') # {@id=123}
def __init__ (self, text) :
- self.value = text
+ self.value = text
def attributeCallback(self, match) :
@@ -359,7 +360,7 @@ def toxml(self) :
text = self.value
self.parent.setBidi(getBidiType(text))
-
+
if not text.startswith(HTML_PLACEHOLDER_PREFIX):
if self.parent.nodeName == "p" :
text = text.replace("\n", "\n ")
@@ -479,7 +480,7 @@ def run (self, lines) :
class HtmlBlockPreprocessor (Preprocessor):
"""Removes html blocks from self.lines"""
-
+
def _get_left_tag(self, block):
return block[1:].replace(">", " ", 1).split()[0].lower()
@@ -488,7 +489,7 @@ def _get_right_tag(self, left_tag, block):
return block.rstrip()[-len(left_tag)-2:-1].lower()
def _equal_tags(self, left_tag, right_tag):
-
+
if left_tag in ['?', '?php', 'div'] : # handle PHP, etc.
return True
if ("/" + left_tag) == right_tag:
@@ -504,18 +505,18 @@ def _equal_tags(self, left_tag, right_tag):
def _is_oneliner(self, tag):
return (tag in ['hr', 'hr/'])
-
+
def run (self, lines) :
new_blocks = []
text = "\n".join(lines)
text = text.split("\n\n")
-
+
items = []
left_tag = ''
right_tag = ''
in_tag = False # flag
-
+
for block in text:
if block.startswith("\n") :
block = block[1:]
@@ -523,7 +524,7 @@ def run (self, lines) :
if not in_tag:
if block.startswith("<"):
-
+
left_tag = self._get_left_tag(block)
right_tag = self._get_right_tag(left_tag, block)
@@ -535,13 +536,13 @@ def run (self, lines) :
if self._is_oneliner(left_tag):
new_blocks.append(block.strip())
continue
-
+
if block[1] == "!":
# is a comment block
left_tag = "--"
right_tag = self._get_right_tag(left_tag, block)
# keep checking conditions below and maybe just append
-
+
if block.rstrip().endswith(">") \
and self._equal_tags(left_tag, right_tag):
new_blocks.append(
@@ -557,9 +558,9 @@ def run (self, lines) :
else:
items.append(block.strip())
-
+
right_tag = self._get_right_tag(left_tag, block)
-
+
if self._equal_tags(left_tag, right_tag):
# if find closing tag
in_tag = False
@@ -570,7 +571,7 @@ def run (self, lines) :
if items :
new_blocks.append(self.stash.store('\n\n'.join(items)))
new_blocks.append('\n')
-
+
return "\n\n".join(new_blocks).split("\n")
HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor()
@@ -672,7 +673,7 @@ def run (self, lines) :
LINK_ANGLED_RE = BRK + r'\s*\(<([^\)]*)>\)' # [text](
Missing tailing pipe
+
+With a space at the end of the underline row
+
+
+
+
diff --git a/test/php-markdown-extra-cases/Tables.text b/test/php-markdown-extra-cases/Tables.text
index b761ad91..1c1d31cb 100644
--- a/test/php-markdown-extra-cases/Tables.text
+++ b/test/php-markdown-extra-cases/Tables.text
@@ -101,3 +101,10 @@ Header 1 | Header 2 |
--------- | --------- |
Cell | Cell |
Cell | Cell
+
+# With a space at the end of the underline row
+
+Header 1 | Header 2 |
+--------- | --------- |
+Cell 1 | Cell 2 |
+Cell 3 | Cell 4 |
diff --git a/test/test.py b/test/test.py
index c69df78f..995db47a 100755
--- a/test/test.py
+++ b/test/test.py
@@ -4,11 +4,11 @@
"""The markdown2 test suite entry point."""
+import importlib
import os
-from os.path import exists, join, abspath, dirname, normpath
+from os.path import join, abspath, dirname
import sys
import logging
-
import testlib
log = logging.getLogger("test")
@@ -24,25 +24,37 @@ def setup():
# Attempt to get 'pygments' on the import path.
try:
# If already have it, use that one.
- import pygments
+ import pygments # noqa
except ImportError:
pygments_dir = join(top_dir, "deps", "pygments")
- if sys.version_info[0] <= 2:
- sys.path.insert(0, pygments_dir)
- else:
- sys.path.insert(0, pygments_dir + "3")
+ sys.path.insert(0, pygments_dir + "3")
if __name__ == "__main__":
logging.basicConfig()
setup()
default_tags = []
- try:
- import pygments
- except ImportError:
- log.warning("skipping pygments tests ('pygments' module not found)")
- default_tags.append("-pygments")
+ warnings = []
+ for extra_lib in ('pygments', 'wavedrom', 'latex2mathml'):
+ try:
+ mod = importlib.import_module(extra_lib)
+ except ImportError:
+ warnings.append("skipping {} tests ('{}' module not found)".format(extra_lib, extra_lib))
+ default_tags.append("-%s" % extra_lib)
+ else:
+ if extra_lib == 'pygments':
+ version = tuple(int(i) for i in mod.__version__.split('.')[:3])
+ if version >= (2, 14, 0):
+ tag = "pygments<2.14"
+ else:
+ tag = "pygments>=2.14"
+ warnings.append("skipping {} tests (pygments {} found)".format(tag, mod.__version__))
+ default_tags.append("-%s" % tag)
retval = testlib.harness(testdir_from_ns=testdir_from_ns,
default_tags=default_tags)
+
+ for warning in warnings:
+ log.warning(warning)
+
sys.exit(retval)
diff --git a/test/test_markdown2.py b/test/test_markdown2.py
index 363934f8..fb433efc 100755
--- a/test/test_markdown2.py
+++ b/test/test_markdown2.py
@@ -9,14 +9,12 @@
from os.path import join, dirname, abspath, exists, splitext, basename
import re
from glob import glob
-from pprint import pprint
import unittest
import codecs
import difflib
import doctest
from json import loads as json_loads
-
-from testlib import TestError, TestSkipped, tag
+import warnings
sys.path.insert(0, join(dirname(dirname(abspath(__file__)))))
try:
@@ -24,26 +22,6 @@
finally:
del sys.path[0]
-
-
-#---- Python version compat
-
-# Use `bytes` for byte strings and `unicode` for unicode strings (str in Py3).
-if sys.version_info[0] <= 2:
- py3 = False
- try:
- bytes
- except NameError:
- bytes = str
- base_string_type = basestring
-elif sys.version_info[0] >= 3:
- py3 = True
- unicode = str
- base_string_type = str
- unichr = chr
-
-
-
#---- Test cases
class _MarkdownTestCase(unittest.TestCase):
@@ -73,16 +51,19 @@ def _assertMarkdownParity(self, text):
def _assertMarkdownPath(self, text_path, encoding="utf-8", opts=None,
toc_html_path=None, metadata_path=None):
- text = codecs.open(text_path, 'r', encoding=encoding).read()
+ with open(text_path, 'r', encoding=encoding) as f:
+ text = f.read()
html_path = splitext(text_path)[0] + ".html"
- html = codecs.open(html_path, 'r', encoding=encoding).read()
+ with open(html_path, 'r', encoding=encoding) as f:
+ html = f.read()
extra = {}
if toc_html_path:
- extra["toc_html"] = codecs.open(toc_html_path, 'r', encoding=encoding).read()
+ with open(toc_html_path, 'r', encoding=encoding) as f:
+ extra["toc_html"] = f.read()
extra["toc_html_path"] = toc_html_path
if metadata_path:
- extra["metadata"] = json_loads(
- codecs.open(metadata_path, 'r', encoding=encoding).read())
+ with open(metadata_path, 'r', encoding=encoding) as f:
+ extra["metadata"] = json_loads(f.read())
extra["metadata_path"] = metadata_path
self._assertMarkdown(text, html, text_path, html_path, opts=opts, **extra)
@@ -126,12 +107,8 @@ def _assertMarkdown(self, text, html, text_path=None, html_path=None,
def charreprreplace(exc):
if not isinstance(exc, UnicodeEncodeError):
raise TypeError("don't know how to handle %r" % exc)
- if py3:
- obj_repr = repr(exc.object[exc.start:exc.end])[1:-1]
- else:
- # repr -> remote "u'" and "'"
- obj_repr = repr(exc.object[exc.start:exc.end])[2:-1]
- return (unicode(obj_repr), exc.end)
+ obj_repr = repr(exc.object[exc.start:exc.end])[1:-1]
+ return (str(obj_repr), exc.end)
codecs.register_error("charreprreplace", charreprreplace)
self.assertEqual(python_norm_html, norm_html, errmsg)
@@ -177,11 +154,15 @@ def generate_tests(cls):
opts_path = splitext(text_path)[0] + ".opts"
if exists(opts_path):
try:
- opts = eval(open(opts_path, 'r').read())
+ with warnings.catch_warnings(record=True) as caught_warnings:
+ with open(opts_path) as f:
+ opts = eval(f.read())
+ for warning in caught_warnings:
+ print("WARNING: loading %s generated warning: %s - lineno %d" % (opts_path, warning.message, warning.lineno), file=sys.stderr)
except Exception:
_, ex, _ = sys.exc_info()
print("WARNING: couldn't load `%s' opts file: %s" \
- % (opts_path, ex))
+ % (opts_path, ex), file=sys.stderr)
toc_html_path = splitext(text_path)[0] + ".toc_html"
if not exists(toc_html_path):
@@ -198,10 +179,11 @@ def generate_tests(cls):
tags_path = splitext(text_path)[0] + ".tags"
if exists(tags_path):
tags = []
- for line in open(tags_path):
- if '#' in line: # allow comments in .tags files
- line = line[:line.index('#')]
- tags += line.split()
+ with open(tags_path) as f:
+ for line in f:
+ if '#' in line: # allow comments in .tags files
+ line = line[:line.index('#')]
+ tags += line.split()
test_func.tags = tags
name = splitext(basename(text_path))[0]
@@ -285,7 +267,7 @@ def test_pre(self):
'
+
+
+
+Header 1
+ Header 2
+
+
+Cell 1
+ Cell 2
+
+
+
+Cell 3
+ Cell 4
+
\n')
def test_russian(self):
- ko = '\u043b\u0449' # 'ko' on russian keyboard
+ ko = '\\u043b\\u0449' # 'ko' on russian keyboard
self._assertMarkdown("## %s" % ko,
'#!/usr/bin/python\nprint "hi"\n%s
\n' % ko)
test_russian.tags = ["unicode", "issue3"]
@@ -342,13 +324,6 @@ def test_api(self):
test = doctest.DocFileTest("api.doctests")
test.runTest()
- # Don't bother on Python 3 because (a) there aren't many inline doctests,
- # and (b) they are more to be didactic than comprehensive test suites.
- if not py3:
- def test_internal(self):
- doctest.testmod(markdown2)
-
-
#---- internal support stuff
@@ -356,16 +331,16 @@ def test_internal(self):
def _xml_escape_sub(match):
escape = match.group(1)
if escape[0] == 'x':
- return unichr(int('0'+escape, base=16))
+ return chr(int('0'+escape, base=16))
else:
- return unichr(int(escape))
+ return chr(int(escape))
_markdown_email_link_re = re.compile(r'(.*?)', re.U)
def _markdown_email_link_sub(match):
href, text = match.groups()
href = _xml_escape_re.sub(_xml_escape_sub, href)
text = _xml_escape_re.sub(_xml_escape_sub, text)
- return '%s' % (href, text)
+ return '{}'.format(href, text)
def norm_html_from_html(html):
"""Normalize (somewhat) Markdown'd HTML.
@@ -375,18 +350,17 @@ def norm_html_from_html(html):
Also normalize EOLs.
"""
- if not isinstance(html, unicode):
+ if not isinstance(html, str):
html = html.decode('utf-8')
html = _markdown_email_link_re.sub(
_markdown_email_link_sub, html)
- if sys.platform == "win32":
- html = html.replace('\r\n', '\n')
+ html = html.replace('\r\n', '\n')
return html
def _display(s):
"""Markup the given string for useful display."""
- if not isinstance(s, unicode):
+ if not isinstance(s, str):
s = s.decode("utf-8")
s = _indent(_escaped_text_from_text(s, "whitespace"), 4)
if not s.endswith('\n'):
@@ -533,7 +507,7 @@ def _escaped_text_from_text(text, escapes="eol"):
# - Add _escaped_html_from_text() with a similar call sig.
import re
- if isinstance(escapes, base_string_type):
+ if isinstance(escapes, str):
if escapes == "eol":
escapes = {'\r\n': "\\r\\n\r\n", '\n': "\\n\n", '\r': "\\r\r"}
elif escapes == "whitespace":
diff --git a/test/test_redos.py b/test/test_redos.py
new file mode 100644
index 00000000..3bea176f
--- /dev/null
+++ b/test/test_redos.py
@@ -0,0 +1,96 @@
+import logging
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+log = logging.getLogger("test")
+LIB_DIR = Path(__file__).parent.parent / "lib"
+
+
+def pull_387_example_1():
+ # https://github.com/trentm/python-markdown2/pull/387
+ return "[#a" + " " * 3456
+
+
+def pull_387_example_2():
+ # https://github.com/trentm/python-markdown2/pull/387
+ return "```" + "\n" * 3456
+
+
+def pull_387_example_3():
+ # https://github.com/trentm/python-markdown2/pull/387
+ return "-*-" + " " * 3456
+
+
+def pull_402():
+ # https://github.com/trentm/python-markdown2/pull/402
+ return " " * 100_000 + "$"
+
+
+def issue493():
+ # https://github.com/trentm/python-markdown2/issues/493
+ return "**_" + "*_" * 38730 * 10 + "\x00"
+
+
+def issue_633():
+ # https://github.com/trentm/python-markdown2/issues/633
+ return '
+
+
diff --git a/test/tm-cases/admonitions.opts b/test/tm-cases/admonitions.opts
new file mode 100644
index 00000000..44e237fc
--- /dev/null
+++ b/test/tm-cases/admonitions.opts
@@ -0,0 +1 @@
+{"extras": ["admonitions"]}
\ No newline at end of file
diff --git a/test/tm-cases/admonitions.text b/test/tm-cases/admonitions.text
new file mode 100644
index 00000000..b79ff8a1
--- /dev/null
+++ b/test/tm-cases/admonitions.text
@@ -0,0 +1,38 @@
+.. NOTE:: Admonitions
+ They contain 3 main parts, the admonition type, title and body.
+
+ The admonition type is case insensitive, title is optional and the body
+ should be able to contain pretty much anything. For example:
+
+ - Lists
+ - With multiple levels
+ - Of indentation
+
+ And code blocks:
+
+ print('indented code blocks')
+
+.. warning::
+ The admonition's body must be indented by a tab or 3 or more spaces
+ from where the admonition was declared
+ Otherwise the text is no longer part of the admonition.
+
+.. IMPORTANT::
+ You can also use 3 or more empty lines after an admonition
+ to end it
+
+
+
+ print('In case you wanted something like')
+ print('an indented code block right after')
+
+.. admonition:: Generic admonitions
+
+ These should be given a title but this is not enforced
+
+ .. note:: Nested admonitions
+ Nested admonitions should also work
+
+ - Even inside
+ .. tip::
+ of a list
\ No newline at end of file
diff --git a/test/tm-cases/admonitions_with_fenced_code_blocks.html b/test/tm-cases/admonitions_with_fenced_code_blocks.html
new file mode 100644
index 00000000..7428c571
--- /dev/null
+++ b/test/tm-cases/admonitions_with_fenced_code_blocks.html
@@ -0,0 +1,35 @@
+print('In case you wanted something like')
+print('an indented code block right after')
+
+ print('like so')
+
+ print('Consecutive blocks should also be fine')
+
+ print('Even though fenced code blocks wrap themselves in newlines')
+
+ print('ok')
+
+# admonitions WITHIN fenced code blocks should NOT be rendered
+.. attention:: title
+ body
+
+twowimport picklex

blah
+
+Now some bullets:
+
+ * one
+ * two
+
+
+
+
+