aboutsummaryrefslogtreecommitdiff
path: root/markdown
diff options
context:
space:
mode:
Diffstat (limited to 'markdown')
-rw-r--r--markdown/__init__.py616
-rw-r--r--markdown/__main__.py151
-rw-r--r--markdown/__meta__.py49
-rw-r--r--markdown/blockparser.py92
-rw-r--r--markdown/blockprocessors.py423
-rw-r--r--markdown/commandline.py96
-rw-r--r--markdown/core.py407
-rw-r--r--markdown/etree_loader.py33
-rw-r--r--markdown/extensions/__init__.py86
-rw-r--r--markdown/extensions/abbr.py114
-rw-r--r--markdown/extensions/admonition.py170
-rw-r--r--markdown/extensions/attr_list.py166
-rw-r--r--markdown/extensions/codehilite.py400
-rw-r--r--markdown/extensions/def_list.py87
-rw-r--r--markdown/extensions/extra.py53
-rw-r--r--markdown/extensions/fenced_code.py223
-rw-r--r--markdown/extensions/footnotes.py474
-rw-r--r--markdown/extensions/headerid.py195
-rw-r--r--markdown/extensions/html_tidy.py62
-rw-r--r--markdown/extensions/imagelinks.py119
-rw-r--r--markdown/extensions/legacy_attrs.py67
-rw-r--r--markdown/extensions/legacy_em.py49
-rw-r--r--markdown/extensions/md_in_html.py364
-rw-r--r--markdown/extensions/meta.py83
-rw-r--r--markdown/extensions/nl2br.py33
-rw-r--r--markdown/extensions/rss.py114
-rw-r--r--markdown/extensions/sane_lists.py54
-rw-r--r--markdown/extensions/smarty.py257
-rw-r--r--markdown/extensions/tables.py223
-rw-r--r--markdown/extensions/toc.py472
-rw-r--r--markdown/extensions/wikilinks.py156
-rw-r--r--markdown/html4.py274
-rw-r--r--markdown/htmlparser.py323
-rw-r--r--markdown/inlinepatterns.py875
-rw-r--r--markdown/odict.py162
-rw-r--r--markdown/postprocessors.py124
-rw-r--r--markdown/preprocessors.py223
-rw-r--r--markdown/serializers.py189
-rw-r--r--markdown/test_tools.py220
-rw-r--r--markdown/treeprocessors.py307
-rw-r--r--markdown/util.py358
41 files changed, 5799 insertions, 3144 deletions
diff --git a/markdown/__init__.py b/markdown/__init__.py
index bd52113..d88b1e9 100644
--- a/markdown/__init__.py
+++ b/markdown/__init__.py
@@ -1,614 +1,28 @@
"""
Python Markdown
-===============
-Python Markdown converts Markdown to HTML and can be used as a library or
-called from the command line.
+A Python implementation of John Gruber's Markdown.
-## Basic usage as a module:
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
- import markdown
- md = Markdown()
- html = md.convert(your_text_string)
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
-## Basic use from the command line:
-
- markdown source.txt > destination.html
-
-Run "markdown --help" to see more options.
-
-## Extensions
-
-See <http://www.freewisdom.org/projects/python-markdown/> for more
-information and instructions on how to extend the functionality of
-Python Markdown. Read that before you try modifying this file.
-
-## Authors and License
-
-Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
-maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
-Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).
-
-Contact: markdown@freewisdom.org
-
-Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
-Copyright 200? Django Software Foundation (OrderedDict implementation)
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)
-License: BSD (see docs/LICENSE for details).
-"""
-
-version = "2.0.3"
-version_info = (2,0,3, "Final")
-
-import re
-import codecs
-import sys
-import warnings
-import logging
-from logging import DEBUG, INFO, WARN, ERROR, CRITICAL
-
-
-"""
-CONSTANTS
-=============================================================================
-"""
-
-"""
-Constants you might want to modify
------------------------------------------------------------------------------
-"""
-
-# default logging level for command-line use
-COMMAND_LINE_LOGGING_LEVEL = CRITICAL
-TAB_LENGTH = 4 # expand tabs to this many spaces
-ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz">
-SMART_EMPHASIS = True # this_or_that does not become this<i>or</i>that
-DEFAULT_OUTPUT_FORMAT = 'xhtml1' # xhtml or html4 output
-HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
-BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
- "|script|noscript|form|fieldset|iframe|math"
- "|ins|del|hr|hr/|style|li|dt|dd|thead|tbody"
- "|tr|th|td")
-DOC_TAG = "div" # Element used to wrap document - later removed
-
-# Placeholders
-STX = u'\u0002' # Use STX ("Start of text") for start-of-placeholder
-ETX = u'\u0003' # Use ETX ("End of text") for end-of-placeholder
-INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
-INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
-AMP_SUBSTITUTE = STX+"amp"+ETX
-
-
-"""
-Constants you probably do not need to change
------------------------------------------------------------------------------
-"""
-
-RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
- # Hebrew (0590-05FF), Arabic (0600-06FF),
- # Syriac (0700-074F), Arabic supplement (0750-077F),
- # Thaana (0780-07BF), Nko (07C0-07FF).
- (u'\u2D30', u'\u2D7F'), # Tifinagh
- )
-
-
-"""
-AUXILIARY GLOBAL FUNCTIONS
-=============================================================================
-"""
-
-
-def message(level, text):
- """ A wrapper method for logging debug messages. """
- logger = logging.getLogger('MARKDOWN')
- if logger.handlers:
- # The logger is configured
- logger.log(level, text)
- if level > WARN:
- sys.exit(0)
- elif level > WARN:
- raise MarkdownException, text
- else:
- warnings.warn(text, MarkdownWarning)
-
-
-def isBlockLevel(tag):
- """Check if the tag is a block level HTML tag."""
- return BLOCK_LEVEL_ELEMENTS.match(tag)
-
-"""
-MISC AUXILIARY CLASSES
-=============================================================================
+License: BSD (see LICENSE.md for details).
"""
-class AtomicString(unicode):
- """A string which should not be further processed."""
- pass
-
-
-class MarkdownException(Exception):
- """ A Markdown Exception. """
- pass
-
-
-class MarkdownWarning(Warning):
- """ A Markdown Warning. """
- pass
-
-
-"""
-OVERALL DESIGN
-=============================================================================
-
-Markdown processing takes place in four steps:
-
-1. A bunch of "preprocessors" munge the input text.
-2. BlockParser() parses the high-level structural elements of the
- pre-processed text into an ElementTree.
-3. A bunch of "treeprocessors" are run against the ElementTree. One such
- treeprocessor runs InlinePatterns against the ElementTree, detecting inline
- markup.
-4. Some post-processors are run against the text after the ElementTree has
- been serialized into text.
-5. The output is written to a string.
-
-Those steps are put together by the Markdown() class.
-
-"""
-
-import preprocessors
-import blockprocessors
-import treeprocessors
-import inlinepatterns
-import postprocessors
-import blockparser
-import etree_loader
-import odict
-
-# Extensions should use "markdown.etree" instead of "etree" (or do `from
-# markdown import etree`). Do not import it by yourself.
-
-etree = etree_loader.importETree()
-
-# Adds the ability to output html4
-import html4
-
-
-class Markdown:
- """Convert Markdown to HTML."""
-
- def __init__(self,
- extensions=[],
- extension_configs={},
- safe_mode = False,
- output_format=DEFAULT_OUTPUT_FORMAT):
- """
- Creates a new Markdown instance.
-
- Keyword arguments:
-
- * extensions: A list of extensions.
- If they are of type string, the module mdx_name.py will be loaded.
- If they are a subclass of markdown.Extension, they will be used
- as-is.
- * extension-configs: Configuration setting for extensions.
- * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
- * output_format: Format of output. Supported formats are:
- * "xhtml1": Outputs XHTML 1.x. Default.
- * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
- * "html4": Outputs HTML 4
- * "html": Outputs latest supported version of HTML (currently HTML 4).
- Note that it is suggested that the more specific formats ("xhtml1"
- and "html4") be used as "xhtml" or "html" may change in the future
- if it makes sense at that time.
-
- """
-
- self.safeMode = safe_mode
- self.registeredExtensions = []
- self.docType = ""
- self.stripTopLevelTags = True
-
- # Preprocessors
- self.preprocessors = odict.OrderedDict()
- self.preprocessors["html_block"] = \
- preprocessors.HtmlBlockPreprocessor(self)
- self.preprocessors["reference"] = \
- preprocessors.ReferencePreprocessor(self)
- # footnote preprocessor will be inserted with "<reference"
-
- # Block processors - ran by the parser
- self.parser = blockparser.BlockParser()
- self.parser.blockprocessors['empty'] = \
- blockprocessors.EmptyBlockProcessor(self.parser)
- self.parser.blockprocessors['indent'] = \
- blockprocessors.ListIndentProcessor(self.parser)
- self.parser.blockprocessors['code'] = \
- blockprocessors.CodeBlockProcessor(self.parser)
- self.parser.blockprocessors['hashheader'] = \
- blockprocessors.HashHeaderProcessor(self.parser)
- self.parser.blockprocessors['setextheader'] = \
- blockprocessors.SetextHeaderProcessor(self.parser)
- self.parser.blockprocessors['hr'] = \
- blockprocessors.HRProcessor(self.parser)
- self.parser.blockprocessors['olist'] = \
- blockprocessors.OListProcessor(self.parser)
- self.parser.blockprocessors['ulist'] = \
- blockprocessors.UListProcessor(self.parser)
- self.parser.blockprocessors['quote'] = \
- blockprocessors.BlockQuoteProcessor(self.parser)
- self.parser.blockprocessors['paragraph'] = \
- blockprocessors.ParagraphProcessor(self.parser)
-
-
- #self.prePatterns = []
-
- # Inline patterns - Run on the tree
- self.inlinePatterns = odict.OrderedDict()
- self.inlinePatterns["backtick"] = \
- inlinepatterns.BacktickPattern(inlinepatterns.BACKTICK_RE)
- self.inlinePatterns["escape"] = \
- inlinepatterns.SimpleTextPattern(inlinepatterns.ESCAPE_RE)
- self.inlinePatterns["reference"] = \
- inlinepatterns.ReferencePattern(inlinepatterns.REFERENCE_RE, self)
- self.inlinePatterns["link"] = \
- inlinepatterns.LinkPattern(inlinepatterns.LINK_RE, self)
- self.inlinePatterns["image_link"] = \
- inlinepatterns.ImagePattern(inlinepatterns.IMAGE_LINK_RE, self)
- self.inlinePatterns["image_reference"] = \
- inlinepatterns.ImageReferencePattern(inlinepatterns.IMAGE_REFERENCE_RE, self)
- self.inlinePatterns["autolink"] = \
- inlinepatterns.AutolinkPattern(inlinepatterns.AUTOLINK_RE, self)
- self.inlinePatterns["automail"] = \
- inlinepatterns.AutomailPattern(inlinepatterns.AUTOMAIL_RE, self)
- self.inlinePatterns["linebreak2"] = \
- inlinepatterns.SubstituteTagPattern(inlinepatterns.LINE_BREAK_2_RE, 'br')
- self.inlinePatterns["linebreak"] = \
- inlinepatterns.SubstituteTagPattern(inlinepatterns.LINE_BREAK_RE, 'br')
- self.inlinePatterns["html"] = \
- inlinepatterns.HtmlPattern(inlinepatterns.HTML_RE, self)
- self.inlinePatterns["entity"] = \
- inlinepatterns.HtmlPattern(inlinepatterns.ENTITY_RE, self)
- self.inlinePatterns["not_strong"] = \
- inlinepatterns.SimpleTextPattern(inlinepatterns.NOT_STRONG_RE)
- self.inlinePatterns["strong_em"] = \
- inlinepatterns.DoubleTagPattern(inlinepatterns.STRONG_EM_RE, 'strong,em')
- self.inlinePatterns["strong"] = \
- inlinepatterns.SimpleTagPattern(inlinepatterns.STRONG_RE, 'strong')
- self.inlinePatterns["emphasis"] = \
- inlinepatterns.SimpleTagPattern(inlinepatterns.EMPHASIS_RE, 'em')
- self.inlinePatterns["emphasis2"] = \
- inlinepatterns.SimpleTagPattern(inlinepatterns.EMPHASIS_2_RE, 'em')
- # The order of the handlers matters!!!
-
-
- # Tree processors - run once we have a basic parse.
- self.treeprocessors = odict.OrderedDict()
- self.treeprocessors["inline"] = treeprocessors.InlineProcessor(self)
- self.treeprocessors["prettify"] = \
- treeprocessors.PrettifyTreeprocessor(self)
-
- # Postprocessors - finishing touches.
- self.postprocessors = odict.OrderedDict()
- self.postprocessors["raw_html"] = \
- postprocessors.RawHtmlPostprocessor(self)
- self.postprocessors["amp_substitute"] = \
- postprocessors.AndSubstitutePostprocessor()
- # footnote postprocessor will be inserted with ">amp_substitute"
-
- # Map format keys to serializers
- self.output_formats = {
- 'html' : html4.to_html_string,
- 'html4' : html4.to_html_string,
- 'xhtml' : etree.tostring,
- 'xhtml1': etree.tostring,
- }
-
- self.references = {}
- self.htmlStash = preprocessors.HtmlStash()
- self.registerExtensions(extensions = extensions,
- configs = extension_configs)
- self.set_output_format(output_format)
- self.reset()
-
- def registerExtensions(self, extensions, configs):
- """
- Register extensions with this instance of Markdown.
-
- Keyword aurguments:
-
- * extensions: A list of extensions, which can either
- be strings or objects. See the docstring on Markdown.
- * configs: A dictionary mapping module names to config options.
-
- """
- for ext in extensions:
- if isinstance(ext, basestring):
- ext = load_extension(ext, configs.get(ext, []))
- if isinstance(ext, Extension):
- try:
- ext.extendMarkdown(self, globals())
- except NotImplementedError, e:
- message(ERROR, e)
- else:
- message(ERROR, 'Extension "%s.%s" must be of type: "markdown.Extension".' \
- % (ext.__class__.__module__, ext.__class__.__name__))
-
- def registerExtension(self, extension):
- """ This gets called by the extension """
- self.registeredExtensions.append(extension)
-
- def reset(self):
- """
- Resets all state variables so that we can start with a new text.
- """
- self.htmlStash.reset()
- self.references.clear()
-
- for extension in self.registeredExtensions:
- extension.reset()
-
- def set_output_format(self, format):
- """ Set the output format for the class instance. """
- try:
- self.serializer = self.output_formats[format.lower()]
- except KeyError:
- message(CRITICAL, 'Invalid Output Format: "%s". Use one of %s.' \
- % (format, self.output_formats.keys()))
-
- def convert(self, source):
- """
- Convert markdown to serialized XHTML or HTML.
-
- Keyword arguments:
-
- * source: Source text as a Unicode string.
-
- """
-
- # Fixup the source text
- if not source.strip():
- return u"" # a blank unicode string
- try:
- source = unicode(source)
- except UnicodeDecodeError:
- message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
- return u""
-
- source = source.replace(STX, "").replace(ETX, "")
- source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
- source = re.sub(r'\n\s+\n', '\n\n', source)
- source = source.expandtabs(TAB_LENGTH)
-
- # Split into lines and run the line preprocessors.
- self.lines = source.split("\n")
- for prep in self.preprocessors.values():
- self.lines = prep.run(self.lines)
-
- # Parse the high-level elements.
- root = self.parser.parseDocument(self.lines).getroot()
-
- # Run the tree-processors
- for treeprocessor in self.treeprocessors.values():
- newRoot = treeprocessor.run(root)
- if newRoot:
- root = newRoot
-
- # Serialize _properly_. Strip top-level tags.
- output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf-8"))
- if self.stripTopLevelTags:
- try:
- start = output.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2
- end = output.rindex('</%s>'%DOC_TAG)
- output = output[start:end].strip()
- except ValueError:
- if output.strip().endswith('<%s />'%DOC_TAG):
- # We have an empty document
- output = ''
- else:
- # We have a serious problem
- message(CRITICAL, 'Failed to strip top level tags.')
-
- # Run the text post-processors
- for pp in self.postprocessors.values():
- output = pp.run(output)
-
- return output.strip()
-
- def convertFile(self, input=None, output=None, encoding=None):
- """Converts a markdown file and returns the HTML as a unicode string.
-
- Decodes the file using the provided encoding (defaults to utf-8),
- passes the file content to markdown, and outputs the html to either
- the provided stream or the file with provided name, using the same
- encoding as the source file.
-
- **Note:** This is the only place that decoding and encoding of unicode
- takes place in Python-Markdown. (All other code is unicode-in /
- unicode-out.)
-
- Keyword arguments:
-
- * input: Name of source text file.
- * output: Name of output file. Writes to stdout if `None`.
- * encoding: Encoding of input and output files. Defaults to utf-8.
-
- """
-
- encoding = encoding or "utf-8"
-
- # Read the source
- input_file = codecs.open(input, mode="r", encoding=encoding)
- text = input_file.read()
- input_file.close()
- text = text.lstrip(u'\ufeff') # remove the byte-order mark
-
- # Convert
- html = self.convert(text)
-
- # Write to file or stdout
- if isinstance(output, (str, unicode)):
- output_file = codecs.open(output, "w", encoding=encoding)
- output_file.write(html)
- output_file.close()
- else:
- output.write(html.encode(encoding))
-
-
-"""
-Extensions
------------------------------------------------------------------------------
-"""
-
-class Extension:
- """ Base class for extensions to subclass. """
- def __init__(self, configs = {}):
- """Create an instance of an Extention.
-
- Keyword arguments:
-
- * configs: A dict of configuration setting used by an Extension.
- """
- self.config = configs
-
- def getConfig(self, key):
- """ Return a setting for the given key or an empty string. """
- if key in self.config:
- return self.config[key][0]
- else:
- return ""
-
- def getConfigInfo(self):
- """ Return all config settings as a list of tuples. """
- return [(key, self.config[key][1]) for key in self.config.keys()]
-
- def setConfig(self, key, value):
- """ Set a config setting for `key` with the given `value`. """
- self.config[key][0] = value
-
- def extendMarkdown(self, md, md_globals):
- """
- Add the various proccesors and patterns to the Markdown Instance.
-
- This method must be overriden by every extension.
-
- Keyword arguments:
-
- * md: The Markdown instance.
-
- * md_globals: Global variables in the markdown module namespace.
-
- """
- raise NotImplementedError, 'Extension "%s.%s" must define an "extendMarkdown"' \
- 'method.' % (self.__class__.__module__, self.__class__.__name__)
-
-
-def load_extension(ext_name, configs = []):
- """Load extension by name, then return the module.
-
- The extension name may contain arguments as part of the string in the
- following format: "extname(key1=value1,key2=value2)"
-
- """
-
- # Parse extensions config params (ignore the order)
- configs = dict(configs)
- pos = ext_name.find("(") # find the first "("
- if pos > 0:
- ext_args = ext_name[pos+1:-1]
- ext_name = ext_name[:pos]
- pairs = [x.split("=") for x in ext_args.split(",")]
- configs.update([(x.strip(), y.strip()) for (x, y) in pairs])
-
- # Setup the module names
- ext_module = 'markdown.extensions'
- module_name_new_style = '.'.join([ext_module, ext_name])
- module_name_old_style = '_'.join(['mdx', ext_name])
-
- # Try loading the extention first from one place, then another
- try: # New style (markdown.extensons.<extension>)
- module = __import__(module_name_new_style, {}, {}, [ext_module])
- except ImportError:
- try: # Old style (mdx.<extension>)
- module = __import__(module_name_old_style)
- except ImportError:
- message(WARN, "Failed loading extension '%s' from '%s' or '%s'"
- % (ext_name, module_name_new_style, module_name_old_style))
- # Return None so we don't try to initiate none-existant extension
- return None
-
- # If the module is loaded successfully, we expect it to define a
- # function called makeExtension()
- try:
- return module.makeExtension(configs.items())
- except AttributeError:
- message(CRITICAL, "Failed to initiate extension '%s'" % ext_name)
-
-
-def load_extensions(ext_names):
- """Loads multiple extensions"""
- extensions = []
- for ext_name in ext_names:
- extension = load_extension(ext_name)
- if extension:
- extensions.append(extension)
- return extensions
-
-
-"""
-EXPORTED FUNCTIONS
-=============================================================================
-
-Those are the two functions we really mean to export: markdown() and
-markdownFromFile().
-"""
-
-def markdown(text,
- extensions = [],
- safe_mode = False,
- output_format = DEFAULT_OUTPUT_FORMAT):
- """Convert a markdown string to HTML and return HTML as a unicode string.
-
- This is a shortcut function for `Markdown` class to cover the most
- basic use case. It initializes an instance of Markdown, loads the
- necessary extensions and runs the parser on the given text.
-
- Keyword arguments:
-
- * text: Markdown formatted text as Unicode or ASCII string.
- * extensions: A list of extensions or extension names (may contain config args).
- * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
- * output_format: Format of output. Supported formats are:
- * "xhtml1": Outputs XHTML 1.x. Default.
- * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
- * "html4": Outputs HTML 4
- * "html": Outputs latest supported version of HTML (currently HTML 4).
- Note that it is suggested that the more specific formats ("xhtml1"
- and "html4") be used as "xhtml" or "html" may change in the future
- if it makes sense at that time.
-
- Returns: An HTML document as a string.
-
- """
- md = Markdown(extensions=load_extensions(extensions),
- safe_mode=safe_mode,
- output_format=output_format)
- return md.convert(text)
-
-
-def markdownFromFile(input = None,
- output = None,
- extensions = [],
- encoding = None,
- safe_mode = False,
- output_format = DEFAULT_OUTPUT_FORMAT):
- """Read markdown code from a file and write it to a file or a stream."""
- md = Markdown(extensions=load_extensions(extensions),
- safe_mode=safe_mode,
- output_format=output_format)
- md.convertFile(input, output, encoding)
-
+from .core import Markdown, markdown, markdownFromFile
+from .__meta__ import __version__, __version_info__ # noqa
+# For backward compatibility as some extensions expect it...
+from .extensions import Extension # noqa
+__all__ = ['Markdown', 'markdown', 'markdownFromFile']
diff --git a/markdown/__main__.py b/markdown/__main__.py
new file mode 100644
index 0000000..0184008
--- /dev/null
+++ b/markdown/__main__.py
@@ -0,0 +1,151 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+import sys
+import optparse
+import codecs
+import warnings
+import markdown
+try:
+ # We use `unsafe_load` because users may need to pass in actual Python
+ # objects. As this is only available from the CLI, the user has much
+ # worse problems if an attacker can use this as an attach vector.
+ from yaml import unsafe_load as yaml_load
+except ImportError: # pragma: no cover
+ try:
+ # Fall back to PyYAML <5.1
+ from yaml import load as yaml_load
+ except ImportError:
+ # Fall back to JSON
+ from json import load as yaml_load
+
+import logging
+from logging import DEBUG, WARNING, CRITICAL
+
+logger = logging.getLogger('MARKDOWN')
+
+
+def parse_options(args=None, values=None):
+ """
+ Define and parse `optparse` options for command-line usage.
+ """
+ usage = """%prog [options] [INPUTFILE]
+ (STDIN is assumed if no INPUTFILE is given)"""
+ desc = "A Python implementation of John Gruber's Markdown. " \
+ "https://Python-Markdown.github.io/"
+ ver = "%%prog %s" % markdown.__version__
+
+ parser = optparse.OptionParser(usage=usage, description=desc, version=ver)
+ parser.add_option("-f", "--file", dest="filename", default=None,
+ help="Write output to OUTPUT_FILE. Defaults to STDOUT.",
+ metavar="OUTPUT_FILE")
+ parser.add_option("-e", "--encoding", dest="encoding",
+ help="Encoding for input and output files.",)
+ parser.add_option("-o", "--output_format", dest="output_format",
+ default='xhtml', metavar="OUTPUT_FORMAT",
+ help="Use output format 'xhtml' (default) or 'html'.")
+ parser.add_option("-n", "--no_lazy_ol", dest="lazy_ol",
+ action='store_false', default=True,
+ help="Observe number of first item of ordered lists.")
+ parser.add_option("-x", "--extension", action="append", dest="extensions",
+ help="Load extension EXTENSION.", metavar="EXTENSION")
+ parser.add_option("-c", "--extension_configs",
+ dest="configfile", default=None,
+ help="Read extension configurations from CONFIG_FILE. "
+ "CONFIG_FILE must be of JSON or YAML format. YAML "
+ "format requires that a python YAML library be "
+ "installed. The parsed JSON or YAML must result in a "
+ "python dictionary which would be accepted by the "
+ "'extension_configs' keyword on the markdown.Markdown "
+ "class. The extensions must also be loaded with the "
+ "`--extension` option.",
+ metavar="CONFIG_FILE")
+ parser.add_option("-q", "--quiet", default=CRITICAL,
+ action="store_const", const=CRITICAL+10, dest="verbose",
+ help="Suppress all warnings.")
+ parser.add_option("-v", "--verbose",
+ action="store_const", const=WARNING, dest="verbose",
+ help="Print all warnings.")
+ parser.add_option("--noisy",
+ action="store_const", const=DEBUG, dest="verbose",
+ help="Print debug messages.")
+
+ (options, args) = parser.parse_args(args, values)
+
+ if len(args) == 0:
+ input_file = None
+ else:
+ input_file = args[0]
+
+ if not options.extensions:
+ options.extensions = []
+
+ extension_configs = {}
+ if options.configfile:
+ with codecs.open(
+ options.configfile, mode="r", encoding=options.encoding
+ ) as fp:
+ try:
+ extension_configs = yaml_load(fp)
+ except Exception as e:
+ message = "Failed parsing extension config file: %s" % \
+ options.configfile
+ e.args = (message,) + e.args[1:]
+ raise
+
+ opts = {
+ 'input': input_file,
+ 'output': options.filename,
+ 'extensions': options.extensions,
+ 'extension_configs': extension_configs,
+ 'encoding': options.encoding,
+ 'output_format': options.output_format,
+ 'lazy_ol': options.lazy_ol
+ }
+
+ return opts, options.verbose
+
+
+def run(): # pragma: no cover
+ """Run Markdown from the command line."""
+
+ # Parse options and adjust logging level if necessary
+ options, logging_level = parse_options()
+ if not options:
+ sys.exit(2)
+ logger.setLevel(logging_level)
+ console_handler = logging.StreamHandler()
+ logger.addHandler(console_handler)
+ if logging_level <= WARNING:
+ # Ensure deprecation warnings get displayed
+ warnings.filterwarnings('default')
+ logging.captureWarnings(True)
+ warn_logger = logging.getLogger('py.warnings')
+ warn_logger.addHandler(console_handler)
+
+ # Run
+ markdown.markdownFromFile(**options)
+
+
+if __name__ == '__main__': # pragma: no cover
+ # Support running module as a commandline command.
+ # `python -m markdown [options] [args]`.
+ run()
diff --git a/markdown/__meta__.py b/markdown/__meta__.py
new file mode 100644
index 0000000..ccabee5
--- /dev/null
+++ b/markdown/__meta__.py
@@ -0,0 +1,49 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+# __version_info__ format:
+# (major, minor, patch, dev/alpha/beta/rc/final, #)
+# (1, 1, 2, 'dev', 0) => "1.1.2.dev0"
+# (1, 1, 2, 'alpha', 1) => "1.1.2a1"
+# (1, 2, 0, 'beta', 2) => "1.2b2"
+# (1, 2, 0, 'rc', 4) => "1.2rc4"
+# (1, 2, 0, 'final', 0) => "1.2"
+__version_info__ = (3, 4, 1, 'final', 0)
+
+
+def _get_version(version_info):
+ " Returns a PEP 440-compliant version number from version_info. "
+ assert len(version_info) == 5
+ assert version_info[3] in ('dev', 'alpha', 'beta', 'rc', 'final')
+
+ parts = 2 if version_info[2] == 0 else 3
+ v = '.'.join(map(str, version_info[:parts]))
+
+ if version_info[3] == 'dev':
+ v += '.dev' + str(version_info[4])
+ elif version_info[3] != 'final':
+ mapping = {'alpha': 'a', 'beta': 'b', 'rc': 'rc'}
+ v += mapping[version_info[3]] + str(version_info[4])
+
+ return v
+
+
+__version__ = _get_version(__version_info__)
diff --git a/markdown/blockparser.py b/markdown/blockparser.py
index e18b338..b0ca4b1 100644
--- a/markdown/blockparser.py
+++ b/markdown/blockparser.py
@@ -1,16 +1,38 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+import xml.etree.ElementTree as etree
+from . import util
-import markdown
class State(list):
- """ Track the current and nested state of the parser.
-
- This utility class is used to track the state of the BlockParser and
+ """ Track the current and nested state of the parser.
+
+ This utility class is used to track the state of the BlockParser and
support multiple levels if nesting. It's just a simple API wrapped around
a list. Each time a state is set, that state is appended to the end of the
list. Each time a state is reset, that state is removed from the end of
the list.
- Therefore, each time a state is set for a nested block, that state must be
+ Therefore, each time a state is set for a nested block, that state must be
reset when we back out of that level of nesting or the state could be
corrupted.
@@ -34,62 +56,64 @@ class State(list):
else:
return False
+
class BlockParser:
- """ Parse Markdown blocks into an ElementTree object.
-
+ """ Parse Markdown blocks into an ElementTree object.
+
A wrapper class that stitches the various BlockProcessors together,
looping through them and creating an ElementTree object.
"""
- def __init__(self):
- self.blockprocessors = markdown.odict.OrderedDict()
+ def __init__(self, md):
+ self.blockprocessors = util.Registry()
self.state = State()
+ self.md = md
def parseDocument(self, lines):
- """ Parse a markdown document into an ElementTree.
-
- Given a list of lines, an ElementTree object (not just a parent Element)
- is created and the root element is passed to the parser as the parent.
- The ElementTree object is returned.
-
+ """ Parse a markdown document into an ElementTree.
+
+ Given a list of lines, an ElementTree object (not just a parent
+ Element) is created and the root element is passed to the parser
+ as the parent. The ElementTree object is returned.
+
This should only be called on an entire document, not pieces.
"""
# Create a ElementTree from the lines
- self.root = markdown.etree.Element(markdown.DOC_TAG)
+ self.root = etree.Element(self.md.doc_tag)
self.parseChunk(self.root, '\n'.join(lines))
- return markdown.etree.ElementTree(self.root)
+ return etree.ElementTree(self.root)
def parseChunk(self, parent, text):
- """ Parse a chunk of markdown text and attach to given etree node.
-
+ """ Parse a chunk of markdown text and attach to given etree node.
+
While the ``text`` argument is generally assumed to contain multiple
blocks which will be split on blank lines, it could contain only one
block. Generally, this method would be called by extensions when
- block parsing is required.
-
- The ``parent`` etree Element passed in is altered in place.
+ block parsing is required.
+
+ The ``parent`` etree Element passed in is altered in place.
Nothing is returned.
"""
self.parseBlocks(parent, text.split('\n\n'))
def parseBlocks(self, parent, blocks):
- """ Process blocks of markdown text and attach to given etree node.
-
+ """ Process blocks of markdown text and attach to given etree node.
+
Given a list of ``blocks``, each blockprocessor is stepped through
until there are no blocks left. While an extension could potentially
- call this method directly, it's generally expected to be used internally.
+ call this method directly, it's generally expected to be used
+ internally.
- This is a public method as an extension may need to add/alter additional
- BlockProcessors which call this method to recursively parse a nested
- block.
+ This is a public method as an extension may need to add/alter
+ additional BlockProcessors which call this method to recursively
+ parse a nested block.
"""
while blocks:
- for processor in self.blockprocessors.values():
- if processor.test(parent, blocks[0]):
- processor.run(parent, blocks)
- break
-
-
+ for processor in self.blockprocessors:
+ if processor.test(parent, blocks[0]):
+ if processor.run(parent, blocks) is not False:
+ # run returns True or None
+ break
diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py
index 7d3b137..3d0ff86 100644
--- a/markdown/blockprocessors.py
+++ b/markdown/blockprocessors.py
@@ -1,23 +1,64 @@
"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+
CORE MARKDOWN BLOCKPARSER
-=============================================================================
+===========================================================================
-This parser handles basic parsing of Markdown blocks. It doesn't concern itself
-with inline elements such as **bold** or *italics*, but rather just catches
-blocks, lists, quotes, etc.
+This parser handles basic parsing of Markdown blocks. It doesn't concern
+itself with inline elements such as **bold** or *italics*, but rather just
+catches blocks, lists, quotes, etc.
-The BlockParser is made up of a bunch of BlockProssors, each handling a
+The BlockParser is made up of a bunch of BlockProcessors, each handling a
different type of block. Extensions may add/replace/remove BlockProcessors
as they need to alter how markdown blocks are parsed.
-
"""
+import logging
import re
-import markdown
+import xml.etree.ElementTree as etree
+from . import util
+from .blockparser import BlockParser
+
+logger = logging.getLogger('MARKDOWN')
+
+
+def build_block_parser(md, **kwargs):
+ """ Build the default block parser used by Markdown. """
+ parser = BlockParser(md)
+ parser.blockprocessors.register(EmptyBlockProcessor(parser), 'empty', 100)
+ parser.blockprocessors.register(ListIndentProcessor(parser), 'indent', 90)
+ parser.blockprocessors.register(CodeBlockProcessor(parser), 'code', 80)
+ parser.blockprocessors.register(HashHeaderProcessor(parser), 'hashheader', 70)
+ parser.blockprocessors.register(SetextHeaderProcessor(parser), 'setextheader', 60)
+ parser.blockprocessors.register(HRProcessor(parser), 'hr', 50)
+ parser.blockprocessors.register(OListProcessor(parser), 'olist', 40)
+ parser.blockprocessors.register(UListProcessor(parser), 'ulist', 30)
+ parser.blockprocessors.register(BlockQuoteProcessor(parser), 'quote', 20)
+ parser.blockprocessors.register(ReferenceProcessor(parser), 'reference', 15)
+ parser.blockprocessors.register(ParagraphProcessor(parser), 'paragraph', 10)
+ return parser
+
class BlockProcessor:
- """ Base class for block processors.
-
+ """ Base class for block processors.
+
Each subclass will provide the methods below to work with the source and
tree. Each processor will need to define it's own ``test`` and ``run``
methods. The ``test`` method should return True or False, to indicate
@@ -26,8 +67,9 @@ class BlockProcessor:
"""
- def __init__(self, parser=None):
+ def __init__(self, parser):
self.parser = parser
+ self.tab_length = parser.md.tab_length
def lastChild(self, parent):
""" Return the last child of an etree element. """
@@ -36,13 +78,15 @@ class BlockProcessor:
else:
return None
- def detab(self, text):
+ def detab(self, text, length=None):
""" Remove a tab from the front of each line of the given text. """
+ if length is None:
+ length = self.tab_length
newtext = []
lines = text.split('\n')
for line in lines:
- if line.startswith(' '*markdown.TAB_LENGTH):
- newtext.append(line[markdown.TAB_LENGTH:])
+ if line.startswith(' ' * length):
+ newtext.append(line[length:])
elif not line.strip():
newtext.append('')
else:
@@ -53,37 +97,37 @@ class BlockProcessor:
""" Remove a tab from front of lines but allowing dedented lines. """
lines = text.split('\n')
for i in range(len(lines)):
- if lines[i].startswith(' '*markdown.TAB_LENGTH*level):
- lines[i] = lines[i][markdown.TAB_LENGTH*level:]
+ if lines[i].startswith(' '*self.tab_length*level):
+ lines[i] = lines[i][self.tab_length*level:]
return '\n'.join(lines)
def test(self, parent, block):
- """ Test for block type. Must be overridden by subclasses.
-
- As the parser loops through processors, it will call the ``test`` method
- on each to determine if the given block of text is of that type. This
- method must return a boolean ``True`` or ``False``. The actual method of
- testing is left to the needs of that particular block type. It could
- be as simple as ``block.startswith(some_string)`` or a complex regular
- expression. As the block type may be different depending on the parent
- of the block (i.e. inside a list), the parent etree element is also
- provided and may be used as part of the test.
+ """ Test for block type. Must be overridden by subclasses.
+
+ As the parser loops through processors, it will call the ``test``
+ method on each to determine if the given block of text is of that
+ type. This method must return a boolean ``True`` or ``False``. The
+ actual method of testing is left to the needs of that particular
+ block type. It could be as simple as ``block.startswith(some_string)``
+ or a complex regular expression. As the block type may be different
+ depending on the parent of the block (i.e. inside a list), the parent
+ etree element is also provided and may be used as part of the test.
Keywords:
-
+
* ``parent``: A etree element which will be the parent of the block.
- * ``block``: A block of text from the source which has been split at
+ * ``block``: A block of text from the source which has been split at
blank lines.
"""
- pass
+ pass # pragma: no cover
def run(self, parent, blocks):
- """ Run processor. Must be overridden by subclasses.
-
+ """ Run processor. Must be overridden by subclasses.
+
When the parser determines the appropriate type of a block, the parser
will call the corresponding processor's ``run`` method. This method
should parse the individual lines of the block and append them to
- the etree.
+ the etree.
Note that both the ``parent`` and ``etree`` keywords are pointers
to instances of the objects which should be edited in place. Each
@@ -99,12 +143,12 @@ class BlockProcessor:
* ``parent``: A etree element which is the parent of the current block.
* ``blocks``: A list of all remaining blocks of the document.
"""
- pass
+ pass # pragma: no cover
class ListIndentProcessor(BlockProcessor):
- """ Process children of list items.
-
+ """ Process children of list items.
+
Example:
* a list item
process this part
@@ -113,18 +157,19 @@ class ListIndentProcessor(BlockProcessor):
"""
- INDENT_RE = re.compile(r'^(([ ]{%s})+)'% markdown.TAB_LENGTH)
ITEM_TYPES = ['li']
LIST_TYPES = ['ul', 'ol']
+ def __init__(self, *args):
+ super().__init__(*args)
+ self.INDENT_RE = re.compile(r'^(([ ]{%s})+)' % self.tab_length)
+
def test(self, parent, block):
- return block.startswith(' '*markdown.TAB_LENGTH) and \
- not self.parser.state.isstate('detabbed') and \
- (parent.tag in self.ITEM_TYPES or \
- (len(parent) and parent[-1] and \
- (parent[-1].tag in self.LIST_TYPES)
- )
- )
+ return block.startswith(' '*self.tab_length) and \
+ not self.parser.state.isstate('detabbed') and \
+ (parent.tag in self.ITEM_TYPES or
+ (len(parent) and parent[-1] is not None and
+ (parent[-1].tag in self.LIST_TYPES)))
def run(self, parent, blocks):
block = blocks.pop(0)
@@ -133,8 +178,16 @@ class ListIndentProcessor(BlockProcessor):
self.parser.state.set('detabbed')
if parent.tag in self.ITEM_TYPES:
- # The parent is already a li. Just parse the child block.
- self.parser.parseBlocks(parent, [block])
+ # It's possible that this parent has a 'ul' or 'ol' child list
+ # with a member. If that is the case, then that should be the
+ # parent. This is intended to catch the edge case of an indented
+ # list whose first member was parsed previous to this point
+ # see OListProcessor
+ if len(parent) and parent[-1].tag in self.LIST_TYPES:
+ self.parser.parseBlocks(parent[-1], [block])
+ else:
+ # The parent is already a li. Just parse the child block.
+ self.parser.parseBlocks(parent, [block])
elif sibling.tag in self.ITEM_TYPES:
# The sibling is a li. Use it as parent.
self.parser.parseBlocks(sibling, [block])
@@ -143,8 +196,12 @@ class ListIndentProcessor(BlockProcessor):
# Assume the last child li is the parent of this block.
if sibling[-1].text:
# If the parent li has text, that text needs to be moved to a p
- block = '%s\n\n%s' % (sibling[-1].text, block)
+ # The p must be 'inserted' at beginning of list in the event
+ # that other children already exist i.e.; a nested sublist.
+ p = etree.Element('p')
+ p.text = sibling[-1].text
sibling[-1].text = ''
+ sibling[-1].insert(0, p)
self.parser.parseChunk(sibling[-1], block)
else:
self.create_item(sibling, block)
@@ -152,15 +209,15 @@ class ListIndentProcessor(BlockProcessor):
def create_item(self, parent, block):
""" Create a new li and parse the block with it as the parent. """
- li = markdown.etree.SubElement(parent, 'li')
+ li = etree.SubElement(parent, 'li')
self.parser.parseBlocks(li, [block])
-
+
def get_level(self, parent, block):
""" Get level of indent based on list level. """
# Get indent level
m = self.INDENT_RE.match(block)
if m:
- indent_level = len(m.group(1))/markdown.TAB_LENGTH
+ indent_level = len(m.group(1))/self.tab_length
else:
indent_level = 0
if self.parser.state.isstate('list'):
@@ -172,7 +229,8 @@ class ListIndentProcessor(BlockProcessor):
# Step through children of tree to find matching indent level.
while indent_level > level:
child = self.lastChild(parent)
- if child and (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES):
+ if (child is not None and
+ (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES)):
if child.tag in self.LIST_TYPES:
level += 1
parent = child
@@ -187,28 +245,30 @@ class CodeBlockProcessor(BlockProcessor):
""" Process code blocks. """
def test(self, parent, block):
- return block.startswith(' '*markdown.TAB_LENGTH)
-
+ return block.startswith(' '*self.tab_length)
+
def run(self, parent, blocks):
sibling = self.lastChild(parent)
block = blocks.pop(0)
theRest = ''
- if sibling and sibling.tag == "pre" and len(sibling) \
- and sibling[0].tag == "code":
+ if (sibling is not None and sibling.tag == "pre" and
+ len(sibling) and sibling[0].tag == "code"):
# The previous block was a code block. As blank lines do not start
# new code blocks, append this block to the previous, adding back
# linebreaks removed from the split into a list.
code = sibling[0]
block, theRest = self.detab(block)
- code.text = markdown.AtomicString('%s\n%s\n' % (code.text, block.rstrip()))
+ code.text = util.AtomicString(
+ '{}\n{}\n'.format(code.text, util.code_escape(block.rstrip()))
+ )
else:
# This is a new codeblock. Create the elements and insert text.
- pre = markdown.etree.SubElement(parent, 'pre')
- code = markdown.etree.SubElement(pre, 'code')
+ pre = etree.SubElement(parent, 'pre')
+ code = etree.SubElement(pre, 'code')
block, theRest = self.detab(block)
- code.text = markdown.AtomicString('%s\n' % block.rstrip())
+ code.text = util.AtomicString('%s\n' % util.code_escape(block.rstrip()))
if theRest:
- # This block contained unindented line(s) after the first indented
+ # This block contained unindented line(s) after the first indented
# line. Insert these lines as the first block of the master blocks
# list for future processing.
blocks.insert(0, theRest)
@@ -219,27 +279,31 @@ class BlockQuoteProcessor(BlockProcessor):
RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')
def test(self, parent, block):
- return bool(self.RE.search(block))
+ return bool(self.RE.search(block)) and not util.nearing_recursion_limit()
def run(self, parent, blocks):
block = blocks.pop(0)
m = self.RE.search(block)
if m:
- before = block[:m.start()] # Lines before blockquote
- # Pass lines before blockquote in recursively for parsing forst.
+ before = block[:m.start()] # Lines before blockquote
+ # Pass lines before blockquote in recursively for parsing first.
self.parser.parseBlocks(parent, [before])
- # Remove ``> `` from begining of each line.
- block = '\n'.join([self.clean(line) for line in
- block[m.start():].split('\n')])
+ # Remove ``> `` from beginning of each line.
+ block = '\n'.join(
+ [self.clean(line) for line in block[m.start():].split('\n')]
+ )
sibling = self.lastChild(parent)
- if sibling and sibling.tag == "blockquote":
+ if sibling is not None and sibling.tag == "blockquote":
# Previous block was a blockquote so set that as this blocks parent
quote = sibling
else:
# This is a new blockquote. Create a new parent element.
- quote = markdown.etree.SubElement(parent, 'blockquote')
+ quote = etree.SubElement(parent, 'blockquote')
# Recursively parse block with blockquote as parent.
+ # change parser state so blockquotes embedded in lists use p tags
+ self.parser.state.set('blockquote')
self.parser.parseChunk(quote, block)
+ self.parser.state.reset()
def clean(self, line):
""" Remove ``>`` from beginning of a line. """
@@ -251,16 +315,31 @@ class BlockQuoteProcessor(BlockProcessor):
else:
return line
+
class OListProcessor(BlockProcessor):
""" Process ordered list blocks. """
TAG = 'ol'
- # Detect an item (``1. item``). ``group(1)`` contains contents of item.
- RE = re.compile(r'^[ ]{0,3}\d+\.[ ]+(.*)')
- # Detect items on secondary lines. they can be of either list type.
- CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ]+(.*)')
- # Detect indented (nested) items of either type
- INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ]+.*')
+ # The integer (python string) with which the lists starts (default=1)
+ # Eg: If list is initialized as)
+ # 3. Item
+ # The ol tag will get starts="3" attribute
+ STARTSWITH = '1'
+ # Lazy ol - ignore startswith
+ LAZY_OL = True
+ # List of allowed sibling tags.
+ SIBLING_TAGS = ['ol', 'ul']
+
+ def __init__(self, parser):
+ super().__init__(parser)
+ # Detect an item (``1. item``). ``group(1)`` contains contents of item.
+ self.RE = re.compile(r'^[ ]{0,%d}\d+\.[ ]+(.*)' % (self.tab_length - 1))
+ # Detect items on secondary lines. they can be of either list type.
+ self.CHILD_RE = re.compile(r'^[ ]{0,%d}((\d+\.)|[*+-])[ ]+(.*)' %
+ (self.tab_length - 1))
+ # Detect indented (nested) items of either type
+ self.INDENT_RE = re.compile(r'^[ ]{%d,%d}((\d+\.)|[*+-])[ ]+.*' %
+ (self.tab_length, self.tab_length * 2 - 1))
def test(self, parent, block):
return bool(self.RE.match(block))
@@ -269,33 +348,58 @@ class OListProcessor(BlockProcessor):
# Check fr multiple items in one block.
items = self.get_items(blocks.pop(0))
sibling = self.lastChild(parent)
- if sibling and sibling.tag in ['ol', 'ul']:
+
+ if sibling is not None and sibling.tag in self.SIBLING_TAGS:
# Previous block was a list item, so set that as parent
lst = sibling
- # make sure previous item is in a p.
- if len(lst) and lst[-1].text and not len(lst[-1]):
- p = markdown.etree.SubElement(lst[-1], 'p')
+ # make sure previous item is in a p- if the item has text,
+ # then it isn't in a p
+ if lst[-1].text:
+ # since it's possible there are other children for this
+ # sibling, we can't just SubElement the p, we need to
+ # insert it as the first item.
+ p = etree.Element('p')
p.text = lst[-1].text
lst[-1].text = ''
+ lst[-1].insert(0, p)
+ # if the last item has a tail, then the tail needs to be put in a p
+ # likely only when a header is not followed by a blank line
+ lch = self.lastChild(lst[-1])
+ if lch is not None and lch.tail:
+ p = etree.SubElement(lst[-1], 'p')
+ p.text = lch.tail.lstrip()
+ lch.tail = ''
+
# parse first block differently as it gets wrapped in a p.
- li = markdown.etree.SubElement(lst, 'li')
+ li = etree.SubElement(lst, 'li')
self.parser.state.set('looselist')
firstitem = items.pop(0)
self.parser.parseBlocks(li, [firstitem])
self.parser.state.reset()
+ elif parent.tag in ['ol', 'ul']:
+ # this catches the edge case of a multi-item indented list whose
+ # first item is in a blank parent-list item:
+ # * * subitem1
+ # * subitem2
+ # see also ListIndentProcessor
+ lst = parent
else:
# This is a new list so create parent with appropriate tag.
- lst = markdown.etree.SubElement(parent, self.TAG)
+ lst = etree.SubElement(parent, self.TAG)
+ # Check if a custom start integer is set
+ if not self.LAZY_OL and self.STARTSWITH != '1':
+ lst.attrib['start'] = self.STARTSWITH
+
self.parser.state.set('list')
# Loop through items in block, recursively parsing each with the
# appropriate parent.
for item in items:
- if item.startswith(' '*markdown.TAB_LENGTH):
+ if item.startswith(' '*self.tab_length):
# Item is indented. Parse with last item as parent
self.parser.parseBlocks(lst[-1], [item])
else:
# New item. Create li and parse with it as parent
- li = markdown.etree.SubElement(lst, 'li')
+ li = etree.SubElement(lst, 'li')
self.parser.parseBlocks(li, [item])
self.parser.state.reset()
@@ -305,18 +409,24 @@ class OListProcessor(BlockProcessor):
for line in block.split('\n'):
m = self.CHILD_RE.match(line)
if m:
- # This is a new item. Append
+ # This is a new list item
+ # Check first item for the start index
+ if not items and self.TAG == 'ol':
+ # Detect the integer value of first list item
+ INTEGER_RE = re.compile(r'(\d+)')
+ self.STARTSWITH = INTEGER_RE.match(m.group(1)).group()
+ # Append to the list
items.append(m.group(3))
elif self.INDENT_RE.match(line):
# This is an indented (possibly nested) item.
- if items[-1].startswith(' '*markdown.TAB_LENGTH):
+ if items[-1].startswith(' '*self.tab_length):
# Previous item was indented. Append to that item.
- items[-1] = '%s\n%s' % (items[-1], line)
+ items[-1] = '{}\n{}'.format(items[-1], line)
else:
items.append(line)
else:
# This is another line of previous item. Append to that item.
- items[-1] = '%s\n%s' % (items[-1], line)
+ items[-1] = '{}\n{}'.format(items[-1], line)
return items
@@ -324,14 +434,18 @@ class UListProcessor(OListProcessor):
""" Process unordered list blocks. """
TAG = 'ul'
- RE = re.compile(r'^[ ]{0,3}[*+-][ ]+(.*)')
+
+ def __init__(self, parser):
+ super().__init__(parser)
+ # Detect an item (``1. item``). ``group(1)`` contains contents of item.
+ self.RE = re.compile(r'^[ ]{0,%d}[*+-][ ]+(.*)' % (self.tab_length - 1))
class HashHeaderProcessor(BlockProcessor):
""" Process Hash Headers. """
# Detect a header at start of any line in block
- RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)')
+ RE = re.compile(r'(?:^|\n)(?P<level>#{1,6})(?P<header>(?:\\.|[^\\])*?)#*(?:\n|$)')
def test(self, parent, block):
return bool(self.RE.search(block))
@@ -340,29 +454,29 @@ class HashHeaderProcessor(BlockProcessor):
block = blocks.pop(0)
m = self.RE.search(block)
if m:
- before = block[:m.start()] # All lines before header
- after = block[m.end():] # All lines after header
+ before = block[:m.start()] # All lines before header
+ after = block[m.end():] # All lines after header
if before:
# As the header was not the first line of the block and the
# lines before the header must be parsed first,
# recursively parse this lines as a block.
self.parser.parseBlocks(parent, [before])
# Create header using named groups from RE
- h = markdown.etree.SubElement(parent, 'h%d' % len(m.group('level')))
+ h = etree.SubElement(parent, 'h%d' % len(m.group('level')))
h.text = m.group('header').strip()
if after:
# Insert remaining lines as first block for future parsing.
blocks.insert(0, after)
- else:
+ else: # pragma: no cover
# This should never happen, but just in case...
- message(CRITICAL, "We've got a problem header!")
+ logger.warn("We've got a problem header: %r" % block)
class SetextHeaderProcessor(BlockProcessor):
""" Process Setext-style Headers. """
# Detect Setext-style header. Must be first 2 lines of block.
- RE = re.compile(r'^.*?\n[=-]{3,}', re.MULTILINE)
+ RE = re.compile(r'^.*?\n[=-]+[ ]*(\n|$)', re.MULTILINE)
def test(self, parent, block):
return bool(self.RE.match(block))
@@ -374,7 +488,7 @@ class SetextHeaderProcessor(BlockProcessor):
level = 1
else:
level = 2
- h = markdown.etree.SubElement(parent, 'h%d' % level)
+ h = etree.SubElement(parent, 'h%d' % level)
h.text = lines[0].strip()
if len(lines) > 2:
# Block contains additional lines. Add to master blocks for later.
@@ -384,58 +498,91 @@ class SetextHeaderProcessor(BlockProcessor):
class HRProcessor(BlockProcessor):
""" Process Horizontal Rules. """
- RE = r'[ ]{0,3}(?P<ch>[*_-])[ ]?((?P=ch)[ ]?){2,}[ ]*'
+ # Python's re module doesn't officially support atomic grouping. However you can fake it.
+ # See https://stackoverflow.com/a/13577411/866026
+ RE = r'^[ ]{0,3}(?=(?P<atomicgroup>(-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,}))(?P=atomicgroup)[ ]*$'
# Detect hr on any line of a block.
- SEARCH_RE = re.compile(r'(^|\n)%s(\n|$)' % RE)
- # Match a hr on a single line of text.
- MATCH_RE = re.compile(r'^%s$' % RE)
+ SEARCH_RE = re.compile(RE, re.MULTILINE)
def test(self, parent, block):
- return bool(self.SEARCH_RE.search(block))
+ m = self.SEARCH_RE.search(block)
+ if m:
+ # Save match object on class instance so we can use it later.
+ self.match = m
+ return True
+ return False
def run(self, parent, blocks):
- lines = blocks.pop(0).split('\n')
- prelines = []
+ block = blocks.pop(0)
+ match = self.match
# Check for lines in block before hr.
- for line in lines:
- m = self.MATCH_RE.match(line)
- if m:
- break
- else:
- prelines.append(line)
- if len(prelines):
+ prelines = block[:match.start()].rstrip('\n')
+ if prelines:
# Recursively parse lines before hr so they get parsed first.
- self.parser.parseBlocks(parent, ['\n'.join(prelines)])
+ self.parser.parseBlocks(parent, [prelines])
# create hr
- hr = markdown.etree.SubElement(parent, 'hr')
+ etree.SubElement(parent, 'hr')
# check for lines in block after hr.
- lines = lines[len(prelines)+1:]
- if len(lines):
+ postlines = block[match.end():].lstrip('\n')
+ if postlines:
# Add lines after hr to master blocks for later parsing.
- blocks.insert(0, '\n'.join(lines))
+ blocks.insert(0, postlines)
class EmptyBlockProcessor(BlockProcessor):
- """ Process blocks and start with an empty line. """
+ """ Process blocks that are empty or start with an empty line. """
- # Detect a block that only contains whitespace
- # or only whitespace on the first line.
- RE = re.compile(r'^\s*\n')
+ def test(self, parent, block):
+ return not block or block.startswith('\n')
+
+ def run(self, parent, blocks):
+ block = blocks.pop(0)
+ filler = '\n\n'
+ if block:
+ # Starts with empty line
+ # Only replace a single line.
+ filler = '\n'
+ # Save the rest for later.
+ theRest = block[1:]
+ if theRest:
+ # Add remaining lines to master blocks for later.
+ blocks.insert(0, theRest)
+ sibling = self.lastChild(parent)
+ if (sibling is not None and sibling.tag == 'pre' and
+ len(sibling) and sibling[0].tag == 'code'):
+ # Last block is a codeblock. Append to preserve whitespace.
+ sibling[0].text = util.AtomicString(
+ '{}{}'.format(sibling[0].text, filler)
+ )
+
+
+class ReferenceProcessor(BlockProcessor):
+ """ Process link references. """
+ RE = re.compile(
+ r'^[ ]{0,3}\[([^\[\]]*)\]:[ ]*\n?[ ]*([^\s]+)[ ]*(?:\n[ ]*)?((["\'])(.*)\4[ ]*|\((.*)\)[ ]*)?$', re.MULTILINE
+ )
def test(self, parent, block):
- return bool(self.RE.match(block))
+ return True
def run(self, parent, blocks):
block = blocks.pop(0)
- m = self.RE.match(block)
+ m = self.RE.search(block)
if m:
- # Add remaining line to master blocks for later.
- blocks.insert(0, block[m.end():])
- sibling = self.lastChild(parent)
- if sibling and sibling.tag == 'pre' and sibling[0] and \
- sibling[0].tag == 'code':
- # Last block is a codeblock. Append to preserve whitespace.
- sibling[0].text = markdown.AtomicString('%s/n/n/n' % sibling[0].text )
+ id = m.group(1).strip().lower()
+ link = m.group(2).lstrip('<').rstrip('>')
+ title = m.group(5) or m.group(6)
+ self.parser.md.references[id] = (link, title)
+ if block[m.end():].strip():
+ # Add any content after match back to blocks as separate block
+ blocks.insert(0, block[m.end():].lstrip('\n'))
+ if block[:m.start()].strip():
+ # Add any content before match back to blocks as separate block
+ blocks.insert(0, block[:m.start()].rstrip('\n'))
+ return True
+ # No match. Restore block.
+ blocks.insert(0, block)
+ return False
class ParagraphProcessor(BlockProcessor):
@@ -449,12 +596,28 @@ class ParagraphProcessor(BlockProcessor):
if block.strip():
# Not a blank block. Add to parent, otherwise throw it away.
if self.parser.state.isstate('list'):
- # The parent is a tight-list. Append to parent.text
- if parent.text:
- parent.text = '%s\n%s' % (parent.text, block)
+ # The parent is a tight-list.
+ #
+ # Check for any children. This will likely only happen in a
+ # tight-list when a header isn't followed by a blank line.
+ # For example:
+ #
+ # * # Header
+ # Line 2 of list item - not part of header.
+ sibling = self.lastChild(parent)
+ if sibling is not None:
+ # Insetrt after sibling.
+ if sibling.tail:
+ sibling.tail = '{}\n{}'.format(sibling.tail, block)
+ else:
+ sibling.tail = '\n%s' % block
else:
- parent.text = block.lstrip()
+ # Append to parent.text
+ if parent.text:
+ parent.text = '{}\n{}'.format(parent.text, block)
+ else:
+ parent.text = block.lstrip()
else:
# Create a regular paragraph
- p = markdown.etree.SubElement(parent, 'p')
+ p = etree.SubElement(parent, 'p')
p.text = block.lstrip()
diff --git a/markdown/commandline.py b/markdown/commandline.py
deleted file mode 100644
index 1eedc6d..0000000
--- a/markdown/commandline.py
+++ /dev/null
@@ -1,96 +0,0 @@
-"""
-COMMAND-LINE SPECIFIC STUFF
-=============================================================================
-
-The rest of the code is specifically for handling the case where Python
-Markdown is called from the command line.
-"""
-
-import markdown
-import sys
-import logging
-from logging import DEBUG, INFO, WARN, ERROR, CRITICAL
-
-EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
-""" The name used in the usage statement displayed for python versions < 2.3.
-(With python 2.3 and higher the usage statement is generated by optparse
-and uses the actual name of the executable called.) """
-
-OPTPARSE_WARNING = """
-Python 2.3 or higher required for advanced command line options.
-For lower versions of Python use:
-
- %s INPUT_FILE > OUTPUT_FILE
-
-""" % EXECUTABLE_NAME_FOR_USAGE
-
-def parse_options():
- """
- Define and parse `optparse` options for command-line usage.
- """
-
- try:
- optparse = __import__("optparse")
- except:
- if len(sys.argv) == 2:
- return {'input': sys.argv[1],
- 'output': None,
- 'safe': False,
- 'extensions': [],
- 'encoding': None }, CRITICAL
- else:
- print OPTPARSE_WARNING
- return None, None
-
- parser = optparse.OptionParser(usage="%prog INPUTFILE [options]")
- parser.add_option("-f", "--file", dest="filename", default=sys.stdout,
- help="write output to OUTPUT_FILE",
- metavar="OUTPUT_FILE")
- parser.add_option("-e", "--encoding", dest="encoding",
- help="encoding for input and output files",)
- parser.add_option("-q", "--quiet", default = CRITICAL,
- action="store_const", const=CRITICAL+10, dest="verbose",
- help="suppress all messages")
- parser.add_option("-v", "--verbose",
- action="store_const", const=INFO, dest="verbose",
- help="print info messages")
- parser.add_option("-s", "--safe", dest="safe", default=False,
- metavar="SAFE_MODE",
- help="safe mode ('replace', 'remove' or 'escape' user's HTML tag)")
- parser.add_option("-o", "--output_format", dest="output_format",
- default='xhtml1', metavar="OUTPUT_FORMAT",
- help="Format of output. One of 'xhtml1' (default) or 'html4'.")
- parser.add_option("--noisy",
- action="store_const", const=DEBUG, dest="verbose",
- help="print debug messages")
- parser.add_option("-x", "--extension", action="append", dest="extensions",
- help = "load extension EXTENSION", metavar="EXTENSION")
-
- (options, args) = parser.parse_args()
-
- if not len(args) == 1:
- parser.print_help()
- return None, None
- else:
- input_file = args[0]
-
- if not options.extensions:
- options.extensions = []
-
- return {'input': input_file,
- 'output': options.filename,
- 'safe_mode': options.safe,
- 'extensions': options.extensions,
- 'encoding': options.encoding,
- 'output_format': options.output_format}, options.verbose
-
-def run():
- """Run Markdown from the command line."""
-
- # Parse options and adjust logging level if necessary
- options, logging_level = parse_options()
- if not options: sys.exit(0)
- if logging_level: logging.getLogger('MARKDOWN').setLevel(logging_level)
-
- # Run
- markdown.markdownFromFile(**options)
diff --git a/markdown/core.py b/markdown/core.py
new file mode 100644
index 0000000..f6a171c
--- /dev/null
+++ b/markdown/core.py
@@ -0,0 +1,407 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+import codecs
+import sys
+import logging
+import importlib
+from . import util
+from .preprocessors import build_preprocessors
+from .blockprocessors import build_block_parser
+from .treeprocessors import build_treeprocessors
+from .inlinepatterns import build_inlinepatterns
+from .postprocessors import build_postprocessors
+from .extensions import Extension
+from .serializers import to_html_string, to_xhtml_string
+
+__all__ = ['Markdown', 'markdown', 'markdownFromFile']
+
+
+logger = logging.getLogger('MARKDOWN')
+
+
+class Markdown:
+ """Convert Markdown to HTML."""
+
+ doc_tag = "div" # Element used to wrap document - later removed
+
+ output_formats = {
+ 'html': to_html_string,
+ 'xhtml': to_xhtml_string,
+ }
+
+ def __init__(self, **kwargs):
+ """
+ Creates a new Markdown instance.
+
+ Keyword arguments:
+
+ * extensions: A list of extensions.
+ If an item is an instance of a subclass of `markdown.extension.Extension`, the instance will be used
+ as-is. If an item is of type string, first an entry point will be loaded. If that fails, the string is
+ assumed to use Python dot notation (`path.to.module:ClassName`) to load a markdown.Extension subclass. If
+ no class is specified, then a `makeExtension` function is called within the specified module.
+ * extension_configs: Configuration settings for extensions.
+ * output_format: Format of output. Supported formats are:
+ * "xhtml": Outputs XHTML style tags. Default.
+ * "html": Outputs HTML style tags.
+ * tab_length: Length of tabs in the source. Default: 4
+
+ """
+
+ self.tab_length = kwargs.get('tab_length', 4)
+
+ self.ESCAPED_CHARS = ['\\', '`', '*', '_', '{', '}', '[', ']',
+ '(', ')', '>', '#', '+', '-', '.', '!']
+
+ self.block_level_elements = [
+ # Elements which are invalid to wrap in a `<p>` tag.
+ # See https://w3c.github.io/html/grouping-content.html#the-p-element
+ 'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
+ 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3',
+ 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol',
+ 'p', 'pre', 'section', 'table', 'ul',
+ # Other elements which Markdown should not be mucking up the contents of.
+ 'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend',
+ 'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script',
+ 'style', 'summary', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video'
+ ]
+
+ self.registeredExtensions = []
+ self.docType = ""
+ self.stripTopLevelTags = True
+
+ self.build_parser()
+
+ self.references = {}
+ self.htmlStash = util.HtmlStash()
+ self.registerExtensions(extensions=kwargs.get('extensions', []),
+ configs=kwargs.get('extension_configs', {}))
+ self.set_output_format(kwargs.get('output_format', 'xhtml'))
+ self.reset()
+
+ def build_parser(self):
+ """ Build the parser from the various parts. """
+ self.preprocessors = build_preprocessors(self)
+ self.parser = build_block_parser(self)
+ self.inlinePatterns = build_inlinepatterns(self)
+ self.treeprocessors = build_treeprocessors(self)
+ self.postprocessors = build_postprocessors(self)
+ return self
+
+ def registerExtensions(self, extensions, configs):
+ """
+ Register extensions with this instance of Markdown.
+
+ Keyword arguments:
+
+ * extensions: A list of extensions, which can either
+ be strings or objects.
+ * configs: A dictionary mapping extension names to config options.
+
+ """
+ for ext in extensions:
+ if isinstance(ext, str):
+ ext = self.build_extension(ext, configs.get(ext, {}))
+ if isinstance(ext, Extension):
+ ext.extendMarkdown(self)
+ logger.debug(
+ 'Successfully loaded extension "%s.%s".'
+ % (ext.__class__.__module__, ext.__class__.__name__)
+ )
+ elif ext is not None:
+ raise TypeError(
+ 'Extension "{}.{}" must be of type: "{}.{}"'.format(
+ ext.__class__.__module__, ext.__class__.__name__,
+ Extension.__module__, Extension.__name__
+ )
+ )
+ return self
+
+ def build_extension(self, ext_name, configs):
+ """
+ Build extension from a string name, then return an instance.
+
+ First attempt to load an entry point. The string name must be registered as an entry point in the
+ `markdown.extensions` group which points to a subclass of the `markdown.extensions.Extension` class.
+ If multiple distributions have registered the same name, the first one found is returned.
+
+ If no entry point is found, assume dot notation (`path.to.module:ClassName`). Load the specified class and
+ return an instance. If no class is specified, import the module and call a `makeExtension` function and return
+ the Extension instance returned by that function.
+ """
+ configs = dict(configs)
+
+ entry_points = [ep for ep in util.get_installed_extensions() if ep.name == ext_name]
+ if entry_points:
+ ext = entry_points[0].load()
+ return ext(**configs)
+
+ # Get class name (if provided): `path.to.module:ClassName`
+ ext_name, class_name = ext_name.split(':', 1) if ':' in ext_name else (ext_name, '')
+
+ try:
+ module = importlib.import_module(ext_name)
+ logger.debug(
+ 'Successfully imported extension module "%s".' % ext_name
+ )
+ except ImportError as e:
+ message = 'Failed loading extension "%s".' % ext_name
+ e.args = (message,) + e.args[1:]
+ raise
+
+ if class_name:
+ # Load given class name from module.
+ return getattr(module, class_name)(**configs)
+ else:
+ # Expect makeExtension() function to return a class.
+ try:
+ return module.makeExtension(**configs)
+ except AttributeError as e:
+ message = e.args[0]
+ message = "Failed to initiate extension " \
+ "'%s': %s" % (ext_name, message)
+ e.args = (message,) + e.args[1:]
+ raise
+
+ def registerExtension(self, extension):
+ """ This gets called by the extension """
+ self.registeredExtensions.append(extension)
+ return self
+
+ def reset(self):
+ """
+ Resets all state variables so that we can start with a new text.
+ """
+ self.htmlStash.reset()
+ self.references.clear()
+
+ for extension in self.registeredExtensions:
+ if hasattr(extension, 'reset'):
+ extension.reset()
+
+ return self
+
+ def set_output_format(self, format):
+ """ Set the output format for the class instance. """
+ self.output_format = format.lower().rstrip('145') # ignore num
+ try:
+ self.serializer = self.output_formats[self.output_format]
+ except KeyError as e:
+ valid_formats = list(self.output_formats.keys())
+ valid_formats.sort()
+ message = 'Invalid Output Format: "%s". Use one of %s.' \
+ % (self.output_format,
+ '"' + '", "'.join(valid_formats) + '"')
+ e.args = (message,) + e.args[1:]
+ raise
+ return self
+
+ def is_block_level(self, tag):
+ """Check if the tag is a block level HTML tag."""
+ if isinstance(tag, str):
+ return tag.lower().rstrip('/') in self.block_level_elements
+ # Some ElementTree tags are not strings, so return False.
+ return False
+
+ def convert(self, source):
+ """
+ Convert markdown to serialized XHTML or HTML.
+
+ Keyword arguments:
+
+ * source: Source text as a Unicode string.
+
+ Markdown processing takes place in five steps:
+
+ 1. A bunch of "preprocessors" munge the input text.
+ 2. BlockParser() parses the high-level structural elements of the
+ pre-processed text into an ElementTree.
+ 3. A bunch of "treeprocessors" are run against the ElementTree. One
+ such treeprocessor runs InlinePatterns against the ElementTree,
+ detecting inline markup.
+ 4. Some post-processors are run against the text after the ElementTree
+ has been serialized into text.
+ 5. The output is written to a string.
+
+ """
+
+ # Fixup the source text
+ if not source.strip():
+ return '' # a blank unicode string
+
+ try:
+ source = str(source)
+ except UnicodeDecodeError as e: # pragma: no cover
+ # Customise error message while maintaining original trackback
+ e.reason += '. -- Note: Markdown only accepts unicode input!'
+ raise
+
+ # Split into lines and run the line preprocessors.
+ self.lines = source.split("\n")
+ for prep in self.preprocessors:
+ self.lines = prep.run(self.lines)
+
+ # Parse the high-level elements.
+ root = self.parser.parseDocument(self.lines).getroot()
+
+ # Run the tree-processors
+ for treeprocessor in self.treeprocessors:
+ newRoot = treeprocessor.run(root)
+ if newRoot is not None:
+ root = newRoot
+
+ # Serialize _properly_. Strip top-level tags.
+ output = self.serializer(root)
+ if self.stripTopLevelTags:
+ try:
+ start = output.index(
+ '<%s>' % self.doc_tag) + len(self.doc_tag) + 2
+ end = output.rindex('</%s>' % self.doc_tag)
+ output = output[start:end].strip()
+ except ValueError as e: # pragma: no cover
+ if output.strip().endswith('<%s />' % self.doc_tag):
+ # We have an empty document
+ output = ''
+ else:
+ # We have a serious problem
+ raise ValueError('Markdown failed to strip top-level '
+ 'tags. Document=%r' % output.strip()) from e
+
+ # Run the text post-processors
+ for pp in self.postprocessors:
+ output = pp.run(output)
+
+ return output.strip()
+
+ def convertFile(self, input=None, output=None, encoding=None):
+ """Converts a markdown file and returns the HTML as a unicode string.
+
+ Decodes the file using the provided encoding (defaults to utf-8),
+ passes the file content to markdown, and outputs the html to either
+ the provided stream or the file with provided name, using the same
+ encoding as the source file. The 'xmlcharrefreplace' error handler is
+ used when encoding the output.
+
+ **Note:** This is the only place that decoding and encoding of unicode
+ takes place in Python-Markdown. (All other code is unicode-in /
+ unicode-out.)
+
+ Keyword arguments:
+
+ * input: File object or path. Reads from stdin if `None`.
+ * output: File object or path. Writes to stdout if `None`.
+ * encoding: Encoding of input and output files. Defaults to utf-8.
+
+ """
+
+ encoding = encoding or "utf-8"
+
+ # Read the source
+ if input:
+ if isinstance(input, str):
+ input_file = codecs.open(input, mode="r", encoding=encoding)
+ else:
+ input_file = codecs.getreader(encoding)(input)
+ text = input_file.read()
+ input_file.close()
+ else:
+ text = sys.stdin.read()
+ if not isinstance(text, str): # pragma: no cover
+ text = text.decode(encoding)
+
+ text = text.lstrip('\ufeff') # remove the byte-order mark
+
+ # Convert
+ html = self.convert(text)
+
+ # Write to file or stdout
+ if output:
+ if isinstance(output, str):
+ output_file = codecs.open(output, "w",
+ encoding=encoding,
+ errors="xmlcharrefreplace")
+ output_file.write(html)
+ output_file.close()
+ else:
+ writer = codecs.getwriter(encoding)
+ output_file = writer(output, errors="xmlcharrefreplace")
+ output_file.write(html)
+ # Don't close here. User may want to write more.
+ else:
+ # Encode manually and write bytes to stdout.
+ html = html.encode(encoding, "xmlcharrefreplace")
+ try:
+ # Write bytes directly to buffer (Python 3).
+ sys.stdout.buffer.write(html)
+ except AttributeError: # pragma: no cover
+ # Probably Python 2, which works with bytes by default.
+ sys.stdout.write(html)
+
+ return self
+
+
+"""
+EXPORTED FUNCTIONS
+=============================================================================
+
+Those are the two functions we really mean to export: markdown() and
+markdownFromFile().
+"""
+
+
+def markdown(text, **kwargs):
+ """Convert a markdown string to HTML and return HTML as a unicode string.
+
+ This is a shortcut function for `Markdown` class to cover the most
+ basic use case. It initializes an instance of Markdown, loads the
+ necessary extensions and runs the parser on the given text.
+
+ Keyword arguments:
+
+ * text: Markdown formatted text as Unicode or ASCII string.
+ * Any arguments accepted by the Markdown class.
+
+ Returns: An HTML document as a string.
+
+ """
+ md = Markdown(**kwargs)
+ return md.convert(text)
+
+
+def markdownFromFile(**kwargs):
+ """Read markdown code from a file and write it to a file or a stream.
+
+ This is a shortcut function which initializes an instance of Markdown,
+ and calls the convertFile method rather than convert.
+
+ Keyword arguments:
+
+ * input: a file name or readable object.
+ * output: a file name or writable object.
+ * encoding: Encoding of input and output.
+ * Any arguments accepted by the Markdown class.
+
+ """
+ md = Markdown(**kwargs)
+ md.convertFile(kwargs.get('input', None),
+ kwargs.get('output', None),
+ kwargs.get('encoding', None))
diff --git a/markdown/etree_loader.py b/markdown/etree_loader.py
deleted file mode 100644
index e2599b2..0000000
--- a/markdown/etree_loader.py
+++ /dev/null
@@ -1,33 +0,0 @@
-
-from markdown import message, CRITICAL
-import sys
-
-## Import
-def importETree():
- """Import the best implementation of ElementTree, return a module object."""
- etree_in_c = None
- try: # Is it Python 2.5+ with C implemenation of ElementTree installed?
- import xml.etree.cElementTree as etree_in_c
- except ImportError:
- try: # Is it Python 2.5+ with Python implementation of ElementTree?
- import xml.etree.ElementTree as etree
- except ImportError:
- try: # An earlier version of Python with cElementTree installed?
- import cElementTree as etree_in_c
- except ImportError:
- try: # An earlier version of Python with Python ElementTree?
- import elementtree.ElementTree as etree
- except ImportError:
- message(CRITICAL, "Failed to import ElementTree")
- sys.exit(1)
- if etree_in_c and etree_in_c.VERSION < "1.0":
- message(CRITICAL, "For cElementTree version 1.0 or higher is required.")
- sys.exit(1)
- elif etree_in_c :
- return etree_in_c
- elif etree.VERSION < "1.1":
- message(CRITICAL, "For ElementTree version 1.1 or higher is required")
- sys.exit(1)
- else :
- return etree
-
diff --git a/markdown/extensions/__init__.py b/markdown/extensions/__init__.py
index e69de29..2d8d72a 100644
--- a/markdown/extensions/__init__.py
+++ b/markdown/extensions/__init__.py
@@ -0,0 +1,86 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+from ..util import parseBoolValue
+
+
+class Extension:
+ """ Base class for extensions to subclass. """
+
+ # Default config -- to be overridden by a subclass
+ # Must be of the following format:
+ # {
+ # 'key': ['value', 'description']
+ # }
+ # Note that Extension.setConfig will raise a KeyError
+ # if a default is not set here.
+ config = {}
+
+ def __init__(self, **kwargs):
+ """ Initiate Extension and set up configs. """
+ self.setConfigs(kwargs)
+
+ def getConfig(self, key, default=''):
+ """ Return a setting for the given key or an empty string. """
+ if key in self.config:
+ return self.config[key][0]
+ else:
+ return default
+
+ def getConfigs(self):
+ """ Return all configs settings as a dict. """
+ return {key: self.getConfig(key) for key in self.config.keys()}
+
+ def getConfigInfo(self):
+ """ Return all config descriptions as a list of tuples. """
+ return [(key, self.config[key][1]) for key in self.config.keys()]
+
+ def setConfig(self, key, value):
+ """ Set a config setting for `key` with the given `value`. """
+ if isinstance(self.config[key][0], bool):
+ value = parseBoolValue(value)
+ if self.config[key][0] is None:
+ value = parseBoolValue(value, preserve_none=True)
+ self.config[key][0] = value
+
+ def setConfigs(self, items):
+ """ Set multiple config settings given a dict or list of tuples. """
+ if hasattr(items, 'items'):
+ # it's a dict
+ items = items.items()
+ for key, value in items:
+ self.setConfig(key, value)
+
+ def extendMarkdown(self, md):
+ """
+ Add the various processors and patterns to the Markdown Instance.
+
+ This method must be overridden by every extension.
+
+ Keyword arguments:
+
+ * md: The Markdown instance.
+
+ """
+ raise NotImplementedError(
+ 'Extension "%s.%s" must define an "extendMarkdown"'
+ 'method.' % (self.__class__.__module__, self.__class__.__name__)
+ )
diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py
index 783220e..9879314 100644
--- a/markdown/extensions/abbr.py
+++ b/markdown/extensions/abbr.py
@@ -4,67 +4,74 @@ Abbreviation Extension for Python-Markdown
This extension adds abbreviation handling to Python-Markdown.
-Simple Usage:
-
- >>> import markdown
- >>> text = """
- ... Some text with an ABBR and a REF. Ignore REFERENCE and ref.
- ...
- ... *[ABBR]: Abbreviation
- ... *[REF]: Abbreviation Reference
- ... """
- >>> markdown.markdown(text, ['abbr'])
- u'<p>Some text with an <abbr title="Abbreviation">ABBR</abbr> and a <abbr title="Abbreviation Reference">REF</abbr>. Ignore REFERENCE and ref.</p>'
-
-Copyright 2007-2008
-* [Waylan Limberg](http://achinghead.com/)
-* [Seemant Kulleen](http://www.kulleen.org/)
-
+See <https://Python-Markdown.github.io/extensions/abbreviations>
+for documentation.
+
+Oringinal code Copyright 2007-2008 [Waylan Limberg](http://achinghead.com/) and
+ [Seemant Kulleen](http://www.kulleen.org/)
+
+All changes Copyright 2008-2014 The Python Markdown Project
+
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
'''
-import markdown, re
-from markdown import etree
+from . import Extension
+from ..blockprocessors import BlockProcessor
+from ..inlinepatterns import InlineProcessor
+from ..util import AtomicString
+import re
+import xml.etree.ElementTree as etree
-# Global Vars
-ABBR_REF_RE = re.compile(r'[*]\[(?P<abbr>[^\]]*)\][ ]?:\s*(?P<title>.*)')
-class AbbrExtension(markdown.Extension):
+class AbbrExtension(Extension):
""" Abbreviation Extension for Python-Markdown. """
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Insert AbbrPreprocessor before ReferencePreprocessor. """
- md.preprocessors.add('abbr', AbbrPreprocessor(md), '<reference')
-
-
-class AbbrPreprocessor(markdown.preprocessors.Preprocessor):
+ md.parser.blockprocessors.register(AbbrPreprocessor(md.parser), 'abbr', 16)
+
+
+class AbbrPreprocessor(BlockProcessor):
""" Abbreviation Preprocessor - parse text for abbr references. """
- def run(self, lines):
+ RE = re.compile(r'^[*]\[(?P<abbr>[^\]]*)\][ ]?:[ ]*\n?[ ]*(?P<title>.*)$', re.MULTILINE)
+
+ def test(self, parent, block):
+ return True
+
+ def run(self, parent, blocks):
'''
Find and remove all Abbreviation references from the text.
Each reference is set as a new AbbrPattern in the markdown instance.
-
+
'''
- new_text = []
- for line in lines:
- m = ABBR_REF_RE.match(line)
- if m:
- abbr = m.group('abbr').strip()
- title = m.group('title').strip()
- self.markdown.inlinePatterns['abbr-%s'%abbr] = \
- AbbrPattern(self._generate_pattern(abbr), title)
- else:
- new_text.append(line)
- return new_text
-
+ block = blocks.pop(0)
+ m = self.RE.search(block)
+ if m:
+ abbr = m.group('abbr').strip()
+ title = m.group('title').strip()
+ self.parser.md.inlinePatterns.register(
+ AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2
+ )
+ if block[m.end():].strip():
+ # Add any content after match back to blocks as separate block
+ blocks.insert(0, block[m.end():].lstrip('\n'))
+ if block[:m.start()].strip():
+ # Add any content before match back to blocks as separate block
+ blocks.insert(0, block[:m.start()].rstrip('\n'))
+ return True
+ # No match. Restore block.
+ blocks.insert(0, block)
+ return False
+
def _generate_pattern(self, text):
'''
- Given a string, returns an regex pattern to match that string.
-
- 'HTML' -> r'(?P<abbr>[H][T][M][L])'
-
- Note: we force each char as a literal match (in brackets) as we don't
+ Given a string, returns an regex pattern to match that string.
+
+ 'HTML' -> r'(?P<abbr>[H][T][M][L])'
+
+ Note: we force each char as a literal match (in brackets) as we don't
know what they will be beforehand.
'''
@@ -74,22 +81,19 @@ class AbbrPreprocessor(markdown.preprocessors.Preprocessor):
return r'(?P<abbr>\b%s\b)' % (r''.join(chars))
-class AbbrPattern(markdown.inlinepatterns.Pattern):
+class AbbrInlineProcessor(InlineProcessor):
""" Abbreviation inline pattern. """
def __init__(self, pattern, title):
- markdown.inlinepatterns.Pattern.__init__(self, pattern)
+ super().__init__(pattern)
self.title = title
- def handleMatch(self, m):
+ def handleMatch(self, m, data):
abbr = etree.Element('abbr')
- abbr.text = m.group('abbr')
+ abbr.text = AtomicString(m.group('abbr'))
abbr.set('title', self.title)
- return abbr
+ return abbr, m.start(0), m.end(0)
-def makeExtension(configs=None):
- return AbbrExtension(configs=configs)
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
+def makeExtension(**kwargs): # pragma: no cover
+ return AbbrExtension(**kwargs)
diff --git a/markdown/extensions/admonition.py b/markdown/extensions/admonition.py
new file mode 100644
index 0000000..cb8d901
--- /dev/null
+++ b/markdown/extensions/admonition.py
@@ -0,0 +1,170 @@
+"""
+Admonition extension for Python-Markdown
+========================================
+
+Adds rST-style admonitions. Inspired by [rST][] feature with the same name.
+
+[rST]: http://docutils.sourceforge.net/docs/ref/rst/directives.html#specific-admonitions # noqa
+
+See <https://Python-Markdown.github.io/extensions/admonition>
+for documentation.
+
+Original code Copyright [Tiago Serafim](https://www.tiagoserafim.com/).
+
+All changes Copyright The Python Markdown Project
+
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
+
+"""
+
+from . import Extension
+from ..blockprocessors import BlockProcessor
+import xml.etree.ElementTree as etree
+import re
+
+
+class AdmonitionExtension(Extension):
+ """ Admonition extension for Python-Markdown. """
+
+ def extendMarkdown(self, md):
+ """ Add Admonition to Markdown instance. """
+ md.registerExtension(self)
+
+ md.parser.blockprocessors.register(AdmonitionProcessor(md.parser), 'admonition', 105)
+
+
+class AdmonitionProcessor(BlockProcessor):
+
+ CLASSNAME = 'admonition'
+ CLASSNAME_TITLE = 'admonition-title'
+ RE = re.compile(r'(?:^|\n)!!! ?([\w\-]+(?: +[\w\-]+)*)(?: +"(.*?)")? *(?:\n|$)')
+ RE_SPACES = re.compile(' +')
+
+ def __init__(self, parser):
+ """Initialization."""
+
+ super().__init__(parser)
+
+ self.current_sibling = None
+ self.content_indention = 0
+
+ def parse_content(self, parent, block):
+ """Get sibling admonition.
+
+ Retrieve the appropriate sibling element. This can get tricky when
+ dealing with lists.
+
+ """
+
+ old_block = block
+ the_rest = ''
+
+ # We already acquired the block via test
+ if self.current_sibling is not None:
+ sibling = self.current_sibling
+ block, the_rest = self.detab(block, self.content_indent)
+ self.current_sibling = None
+ self.content_indent = 0
+ return sibling, block, the_rest
+
+ sibling = self.lastChild(parent)
+
+ if sibling is None or sibling.get('class', '').find(self.CLASSNAME) == -1:
+ sibling = None
+ else:
+ # If the last child is a list and the content is sufficiently indented
+ # to be under it, then the content's sibling is in the list.
+ last_child = self.lastChild(sibling)
+ indent = 0
+ while last_child:
+ if (
+ sibling and block.startswith(' ' * self.tab_length * 2) and
+ last_child and last_child.tag in ('ul', 'ol', 'dl')
+ ):
+
+ # The expectation is that we'll find an <li> or <dt>.
+ # We should get its last child as well.
+ sibling = self.lastChild(last_child)
+ last_child = self.lastChild(sibling) if sibling else None
+
+ # Context has been lost at this point, so we must adjust the
+ # text's indentation level so it will be evaluated correctly
+ # under the list.
+ block = block[self.tab_length:]
+ indent += self.tab_length
+ else:
+ last_child = None
+
+ if not block.startswith(' ' * self.tab_length):
+ sibling = None
+
+ if sibling is not None:
+ indent += self.tab_length
+ block, the_rest = self.detab(old_block, indent)
+ self.current_sibling = sibling
+ self.content_indent = indent
+
+ return sibling, block, the_rest
+
+ def test(self, parent, block):
+
+ if self.RE.search(block):
+ return True
+ else:
+ return self.parse_content(parent, block)[0] is not None
+
+ def run(self, parent, blocks):
+ block = blocks.pop(0)
+ m = self.RE.search(block)
+
+ if m:
+ if m.start() > 0:
+ self.parser.parseBlocks(parent, [block[:m.start()]])
+ block = block[m.end():] # removes the first line
+ block, theRest = self.detab(block)
+ else:
+ sibling, block, theRest = self.parse_content(parent, block)
+
+ if m:
+ klass, title = self.get_class_and_title(m)
+ div = etree.SubElement(parent, 'div')
+ div.set('class', '{} {}'.format(self.CLASSNAME, klass))
+ if title:
+ p = etree.SubElement(div, 'p')
+ p.text = title
+ p.set('class', self.CLASSNAME_TITLE)
+ else:
+ # Sibling is a list item, but we need to wrap it's content should be wrapped in <p>
+ if sibling.tag in ('li', 'dd') and sibling.text:
+ text = sibling.text
+ sibling.text = ''
+ p = etree.SubElement(sibling, 'p')
+ p.text = text
+
+ div = sibling
+
+ self.parser.parseChunk(div, block)
+
+ if theRest:
+ # This block contained unindented line(s) after the first indented
+ # line. Insert these lines as the first block of the master blocks
+ # list for future processing.
+ blocks.insert(0, theRest)
+
+ def get_class_and_title(self, match):
+ klass, title = match.group(1).lower(), match.group(2)
+ klass = self.RE_SPACES.sub(' ', klass)
+ if title is None:
+ # no title was provided, use the capitalized classname as title
+ # e.g.: `!!! note` will render
+ # `<p class="admonition-title">Note</p>`
+ title = klass.split(' ', 1)[0].capitalize()
+ elif title == '':
+ # an explicit blank title should not be rendered
+ # e.g.: `!!! warning ""` will *not* render `p` with a title
+ title = None
+ return klass, title
+
+
+def makeExtension(**kwargs): # pragma: no cover
+ return AdmonitionExtension(**kwargs)
diff --git a/markdown/extensions/attr_list.py b/markdown/extensions/attr_list.py
new file mode 100644
index 0000000..9a67551
--- /dev/null
+++ b/markdown/extensions/attr_list.py
@@ -0,0 +1,166 @@
+"""
+Attribute List Extension for Python-Markdown
+============================================
+
+Adds attribute list syntax. Inspired by
+[maruku](http://maruku.rubyforge.org/proposal.html#attribute_lists)'s
+feature of the same name.
+
+See <https://Python-Markdown.github.io/extensions/attr_list>
+for documentation.
+
+Original code Copyright 2011 [Waylan Limberg](http://achinghead.com/).
+
+All changes Copyright 2011-2014 The Python Markdown Project
+
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
+
+"""
+
+from . import Extension
+from ..treeprocessors import Treeprocessor
+import re
+
+
+def _handle_double_quote(s, t):
+ k, v = t.split('=', 1)
+ return k, v.strip('"')
+
+
+def _handle_single_quote(s, t):
+ k, v = t.split('=', 1)
+ return k, v.strip("'")
+
+
+def _handle_key_value(s, t):
+ return t.split('=', 1)
+
+
+def _handle_word(s, t):
+ if t.startswith('.'):
+ return '.', t[1:]
+ if t.startswith('#'):
+ return 'id', t[1:]
+ return t, t
+
+
+_scanner = re.Scanner([
+ (r'[^ =]+=".*?"', _handle_double_quote),
+ (r"[^ =]+='.*?'", _handle_single_quote),
+ (r'[^ =]+=[^ =]+', _handle_key_value),
+ (r'[^ =]+', _handle_word),
+ (r' ', None)
+])
+
+
+def get_attrs(str):
+ """ Parse attribute list and return a list of attribute tuples. """
+ return _scanner.scan(str)[0]
+
+
+def isheader(elem):
+ return elem.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+
+
+class AttrListTreeprocessor(Treeprocessor):
+
+ BASE_RE = r'\{\:?[ ]*([^\}\n ][^\}\n]*)[ ]*\}'
+ HEADER_RE = re.compile(r'[ ]+{}[ ]*$'.format(BASE_RE))
+ BLOCK_RE = re.compile(r'\n[ ]*{}[ ]*$'.format(BASE_RE))
+ INLINE_RE = re.compile(r'^{}'.format(BASE_RE))
+ NAME_RE = re.compile(r'[^A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff'
+ r'\u0370-\u037d\u037f-\u1fff\u200c-\u200d'
+ r'\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff'
+ r'\uf900-\ufdcf\ufdf0-\ufffd'
+ r'\:\-\.0-9\u00b7\u0300-\u036f\u203f-\u2040]+')
+
+ def run(self, doc):
+ for elem in doc.iter():
+ if self.md.is_block_level(elem.tag):
+ # Block level: check for attrs on last line of text
+ RE = self.BLOCK_RE
+ if isheader(elem) or elem.tag in ['dt', 'td', 'th']:
+ # header, def-term, or table cell: check for attrs at end of element
+ RE = self.HEADER_RE
+ if len(elem) and elem.tag == 'li':
+ # special case list items. children may include a ul or ol.
+ pos = None
+ # find the ul or ol position
+ for i, child in enumerate(elem):
+ if child.tag in ['ul', 'ol']:
+ pos = i
+ break
+ if pos is None and elem[-1].tail:
+ # use tail of last child. no ul or ol.
+ m = RE.search(elem[-1].tail)
+ if m:
+ self.assign_attrs(elem, m.group(1))
+ elem[-1].tail = elem[-1].tail[:m.start()]
+ elif pos is not None and pos > 0 and elem[pos-1].tail:
+ # use tail of last child before ul or ol
+ m = RE.search(elem[pos-1].tail)
+ if m:
+ self.assign_attrs(elem, m.group(1))
+ elem[pos-1].tail = elem[pos-1].tail[:m.start()]
+ elif elem.text:
+ # use text. ul is first child.
+ m = RE.search(elem.text)
+ if m:
+ self.assign_attrs(elem, m.group(1))
+ elem.text = elem.text[:m.start()]
+ elif len(elem) and elem[-1].tail:
+ # has children. Get from tail of last child
+ m = RE.search(elem[-1].tail)
+ if m:
+ self.assign_attrs(elem, m.group(1))
+ elem[-1].tail = elem[-1].tail[:m.start()]
+ if isheader(elem):
+ # clean up trailing #s
+ elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
+ elif elem.text:
+ # no children. Get from text.
+ m = RE.search(elem.text)
+ if m:
+ self.assign_attrs(elem, m.group(1))
+ elem.text = elem.text[:m.start()]
+ if isheader(elem):
+ # clean up trailing #s
+ elem.text = elem.text.rstrip('#').rstrip()
+ else:
+ # inline: check for attrs at start of tail
+ if elem.tail:
+ m = self.INLINE_RE.match(elem.tail)
+ if m:
+ self.assign_attrs(elem, m.group(1))
+ elem.tail = elem.tail[m.end():]
+
+ def assign_attrs(self, elem, attrs):
+ """ Assign attrs to element. """
+ for k, v in get_attrs(attrs):
+ if k == '.':
+ # add to class
+ cls = elem.get('class')
+ if cls:
+ elem.set('class', '{} {}'.format(cls, v))
+ else:
+ elem.set('class', v)
+ else:
+ # assign attr k with v
+ elem.set(self.sanitize_name(k), v)
+
+ def sanitize_name(self, name):
+ """
+ Sanitize name as 'an XML Name, minus the ":"'.
+ See https://www.w3.org/TR/REC-xml-names/#NT-NCName
+ """
+ return self.NAME_RE.sub('_', name)
+
+
+class AttrListExtension(Extension):
+ def extendMarkdown(self, md):
+ md.treeprocessors.register(AttrListTreeprocessor(md), 'attr_list', 8)
+ md.registerExtension(self)
+
+
+def makeExtension(**kwargs): # pragma: no cover
+ return AttrListExtension(**kwargs)
diff --git a/markdown/extensions/codehilite.py b/markdown/extensions/codehilite.py
index c5d496b..a54ba21 100644
--- a/markdown/extensions/codehilite.py
+++ b/markdown/extensions/codehilite.py
@@ -1,156 +1,216 @@
-#!/usr/bin/python
-
"""
CodeHilite Extension for Python-Markdown
========================================
Adds code/syntax highlighting to standard Python-Markdown code blocks.
-Copyright 2006-2008 [Waylan Limberg](http://achinghead.com/).
+See <https://Python-Markdown.github.io/extensions/code_hilite>
+for documentation.
+
+Original code Copyright 2006-2008 [Waylan Limberg](http://achinghead.com/).
+
+All changes Copyright 2008-2014 The Python Markdown Project
-Project website: <http://www.freewisdom.org/project/python-markdown/CodeHilite>
-Contact: markdown@freewisdom.org
-
-License: BSD (see ../docs/LICENSE for details)
-
-Dependencies:
-* [Python 2.3+](http://python.org/)
-* [Markdown 2.0+](http://www.freewisdom.org/projects/python-markdown/)
-* [Pygments](http://pygments.org/)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-import markdown
+from . import Extension
+from ..treeprocessors import Treeprocessor
+from ..util import parseBoolValue
+
+try: # pragma: no cover
+ from pygments import highlight
+ from pygments.lexers import get_lexer_by_name, guess_lexer
+ from pygments.formatters import get_formatter_by_name
+ from pygments.util import ClassNotFound
+ pygments = True
+except ImportError: # pragma: no cover
+ pygments = False
+
-# --------------- CONSTANTS YOU MIGHT WANT TO MODIFY -----------------
+def parse_hl_lines(expr):
+ """Support our syntax for emphasizing certain lines of code.
-try:
- TAB_LENGTH = markdown.TAB_LENGTH
-except AttributeError:
- TAB_LENGTH = 4
+ expr should be like '1 2' to emphasize lines 1 and 2 of a code block.
+ Returns a list of ints, the line numbers to emphasize.
+ """
+ if not expr:
+ return []
+
+ try:
+ return list(map(int, expr.split()))
+ except ValueError: # pragma: no cover
+ return []
# ------------------ The Main CodeHilite Class ----------------------
class CodeHilite:
"""
- Determine language of source code, and pass it into the pygments hilighter.
+ Determine language of source code, and pass it on to the Pygments highlighter.
+
+ Usage:
+ code = CodeHilite(src=some_code, lang='python')
+ html = code.hilite()
- Basic Usage:
- >>> code = CodeHilite(src = 'some text')
- >>> html = code.hilite()
-
+ Arguments:
* src: Source string or any object with a .readline attribute.
-
- * linenos: (Boolen) Turn line numbering 'on' or 'off' (off by default).
-
- * css_class: Set class name of wrapper div ('codehilite' by default).
-
- Low Level Usage:
- >>> code = CodeHilite()
- >>> code.src = 'some text' # String or anything with a .readline attr.
- >>> code.linenos = True # True or False; Turns line numbering on or of.
- >>> html = code.hilite()
-
+
+ * lang: String name of Pygments lexer to use for highlighting. Default: `None`.
+
+ * guess_lang: Auto-detect which lexer to use. Ignored if `lang` is set to a valid
+ value. Default: `True`.
+
+ * use_pygments: Pass code to pygments for code highlighting. If `False`, the code is
+ instead wrapped for highlighting by a JavaScript library. Default: `True`.
+
+ * pygments_formatter: The name of a Pygments formatter or a formatter class used for
+ highlighting the code blocks. Default: `html`.
+
+ * linenums: An alias to Pygments `linenos` formatter option. Default: `None`.
+
+ * css_class: An alias to Pygments `cssclass` formatter option. Default: 'codehilite'.
+
+ * lang_prefix: Prefix prepended to the language. Default: "language-".
+
+ Other Options:
+ Any other options are accepted and passed on to the lexer and formatter. Therefore,
+ valid options include any options which are accepted by the `html` formatter or
+ whichever lexer the code's language uses. Note that most lexers do not have any
+ options. However, a few have very useful options, such as PHP's `startinline` option.
+ Any invalid options are ignored without error.
+
+ Formatter options: https://pygments.org/docs/formatters/#HtmlFormatter
+ Lexer Options: https://pygments.org/docs/lexers/
+
+ Additionally, when Pygments is enabled, the code's language is passed to the
+ formatter as an extra option `lang_str`, whose value being `{lang_prefix}{lang}`.
+ This option has no effect to the Pygments's builtin formatters.
+
+ Advanced Usage:
+ code = CodeHilite(
+ src = some_code,
+ lang = 'php',
+ startinline = True, # Lexer option. Snippet does not start with `<?php`.
+ linenostart = 42, # Formatter option. Snippet starts on line 42.
+ hl_lines = [45, 49, 50], # Formatter option. Highlight lines 45, 49, and 50.
+ linenos = 'inline' # Formatter option. Avoid alignment problems.
+ )
+ html = code.hilite()
+
"""
- def __init__(self, src=None, linenos=False, css_class="codehilite"):
+ def __init__(self, src, **options):
self.src = src
- self.lang = None
- self.linenos = linenos
- self.css_class = css_class
-
- def hilite(self):
+ self.lang = options.pop('lang', None)
+ self.guess_lang = options.pop('guess_lang', True)
+ self.use_pygments = options.pop('use_pygments', True)
+ self.lang_prefix = options.pop('lang_prefix', 'language-')
+ self.pygments_formatter = options.pop('pygments_formatter', 'html')
+
+ if 'linenos' not in options:
+ options['linenos'] = options.pop('linenums', None)
+ if 'cssclass' not in options:
+ options['cssclass'] = options.pop('css_class', 'codehilite')
+ if 'wrapcode' not in options:
+ # Override pygments default
+ options['wrapcode'] = True
+ # Disallow use of `full` option
+ options['full'] = False
+
+ self.options = options
+
+ def hilite(self, shebang=True):
"""
- Pass code to the [Pygments](http://pygments.pocoo.org/) highliter with
- optional line numbers. The output should then be styled with css to
- your liking. No styles are applied by default - only styling hooks
- (i.e.: <span class="k">).
+ Pass code to the [Pygments](http://pygments.pocoo.org/) highliter with
+ optional line numbers. The output should then be styled with css to
+ your liking. No styles are applied by default - only styling hooks
+ (i.e.: <span class="k">).
returns : A string of html.
-
+
"""
self.src = self.src.strip('\n')
-
- self._getLang()
-
- try:
- from pygments import highlight
- from pygments.lexers import get_lexer_by_name, guess_lexer, \
- TextLexer
- from pygments.formatters import HtmlFormatter
- except ImportError:
- # just escape and pass through
- txt = self._escape(self.src)
- if self.linenos:
- txt = self._number(txt)
- else :
- txt = '<div class="%s"><pre>%s</pre></div>\n'% \
- (self.css_class, txt)
- return txt
- else:
+
+ if self.lang is None and shebang:
+ self._parseHeader()
+
+ if pygments and self.use_pygments:
try:
- lexer = get_lexer_by_name(self.lang)
+ lexer = get_lexer_by_name(self.lang, **self.options)
except ValueError:
try:
- lexer = guess_lexer(self.src)
- except ValueError:
- lexer = TextLexer()
- formatter = HtmlFormatter(linenos=self.linenos,
- cssclass=self.css_class)
+ if self.guess_lang:
+ lexer = guess_lexer(self.src, **self.options)
+ else:
+ lexer = get_lexer_by_name('text', **self.options)
+ except ValueError: # pragma: no cover
+ lexer = get_lexer_by_name('text', **self.options)
+ if not self.lang:
+ # Use the guessed lexer's language instead
+ self.lang = lexer.aliases[0]
+ lang_str = f'{self.lang_prefix}{self.lang}'
+ if isinstance(self.pygments_formatter, str):
+ try:
+ formatter = get_formatter_by_name(self.pygments_formatter, **self.options)
+ except ClassNotFound:
+ formatter = get_formatter_by_name('html', **self.options)
+ else:
+ formatter = self.pygments_formatter(lang_str=lang_str, **self.options)
return highlight(self.src, lexer, formatter)
-
- def _escape(self, txt):
- """ basic html escaping """
- txt = txt.replace('&', '&amp;')
- txt = txt.replace('<', '&lt;')
- txt = txt.replace('>', '&gt;')
- txt = txt.replace('"', '&quot;')
- return txt
-
- def _number(self, txt):
- """ Use <ol> for line numbering """
- # Fix Whitespace
- txt = txt.replace('\t', ' '*TAB_LENGTH)
- txt = txt.replace(" "*4, "&nbsp; &nbsp; ")
- txt = txt.replace(" "*3, "&nbsp; &nbsp;")
- txt = txt.replace(" "*2, "&nbsp; ")
-
- # Add line numbers
- lines = txt.splitlines()
- txt = '<div class="codehilite"><pre><ol>\n'
- for line in lines:
- txt += '\t<li>%s</li>\n'% line
- txt += '</ol></pre></div>\n'
- return txt
-
-
- def _getLang(self):
- """
- Determines language of a code block from shebang lines and whether said
- line should be removed or left in place. If the sheband line contains a
- path (even a single /) then it is assumed to be a real shebang lines and
- left alone. However, if no path is given (e.i.: #!python or :::python)
- then it is assumed to be a mock shebang for language identifitation of a
- code fragment and removed from the code block prior to processing for
- code highlighting. When a mock shebang (e.i: #!python) is found, line
- numbering is turned on. When colons are found in place of a shebang
- (e.i.: :::python), line numbering is left in the current state - off
- by default.
-
+ else:
+ # just escape and build markup usable by JS highlighting libs
+ txt = self.src.replace('&', '&amp;')
+ txt = txt.replace('<', '&lt;')
+ txt = txt.replace('>', '&gt;')
+ txt = txt.replace('"', '&quot;')
+ classes = []
+ if self.lang:
+ classes.append('{}{}'.format(self.lang_prefix, self.lang))
+ if self.options['linenos']:
+ classes.append('linenums')
+ class_str = ''
+ if classes:
+ class_str = ' class="{}"'.format(' '.join(classes))
+ return '<pre class="{}"><code{}>{}\n</code></pre>\n'.format(
+ self.options['cssclass'],
+ class_str,
+ txt
+ )
+
+ def _parseHeader(self):
+ """
+ Determines language of a code block from shebang line and whether the
+ said line should be removed or left in place. If the sheband line
+ contains a path (even a single /) then it is assumed to be a real
+ shebang line and left alone. However, if no path is given
+ (e.i.: #!python or :::python) then it is assumed to be a mock shebang
+ for language identification of a code fragment and removed from the
+ code block prior to processing for code highlighting. When a mock
+ shebang (e.i: #!python) is found, line numbering is turned on. When
+ colons are found in place of a shebang (e.i.: :::python), line
+ numbering is left in the current state - off by default.
+
+ Also parses optional list of highlight lines, like:
+
+ :::python hl_lines="1 3"
"""
import re
-
- #split text into lines
+
+ # split text into lines
lines = self.src.split("\n")
- #pull first line to examine
+ # pull first line to examine
fl = lines.pop(0)
-
+
c = re.compile(r'''
- (?:(?:::+)|(?P<shebang>[#]!)) # Shebang or 2 or more colons.
- (?P<path>(?:/\w+)*[/ ])? # Zero or 1 path
- (?P<lang>[\w+-]*) # The language
+ (?:(?:^::+)|(?P<shebang>^[#]!)) # Shebang or 2 or more colons
+ (?P<path>(?:/\w+)*[/ ])? # Zero or 1 path
+ (?P<lang>[\w#.+-]*) # The language
+ \s* # Arbitrary whitespace
+ # Optional highlight lines, single- or double-quote-delimited
+ (hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot))?
''', re.VERBOSE)
# search first line for shebang
m = c.search(fl)
@@ -158,67 +218,113 @@ class CodeHilite:
# we have a match
try:
self.lang = m.group('lang').lower()
- except IndexError:
+ except IndexError: # pragma: no cover
self.lang = None
if m.group('path'):
# path exists - restore first line
lines.insert(0, fl)
- if m.group('shebang'):
- # shebang exists - use line numbers
- self.linenos = True
+ if self.options['linenos'] is None and m.group('shebang'):
+ # Overridable and Shebang exists - use line numbers
+ self.options['linenos'] = True
+
+ self.options['hl_lines'] = parse_hl_lines(m.group('hl_lines'))
else:
# No match
lines.insert(0, fl)
-
- self.src = "\n".join(lines).strip("\n")
+ self.src = "\n".join(lines).strip("\n")
# ------------------ The Markdown Extension -------------------------------
-class HiliteTreeprocessor(markdown.treeprocessors.Treeprocessor):
- """ Hilight source code in code blocks. """
+
+
+class HiliteTreeprocessor(Treeprocessor):
+ """ Highlight source code in code blocks. """
+
+ def code_unescape(self, text):
+ """Unescape code."""
+ text = text.replace("&lt;", "<")
+ text = text.replace("&gt;", ">")
+ # Escaped '&' should be replaced at the end to avoid
+ # conflicting with < and >.
+ text = text.replace("&amp;", "&")
+ return text
def run(self, root):
""" Find code blocks and store in htmlStash. """
- blocks = root.getiterator('pre')
+ blocks = root.iter('pre')
for block in blocks:
- children = block.getchildren()
- if len(children) == 1 and children[0].tag == 'code':
- code = CodeHilite(children[0].text,
- linenos=self.config['force_linenos'][0],
- css_class=self.config['css_class'][0])
- placeholder = self.markdown.htmlStash.store(code.hilite(),
- safe=True)
+ if len(block) == 1 and block[0].tag == 'code':
+ local_config = self.config.copy()
+ code = CodeHilite(
+ self.code_unescape(block[0].text),
+ tab_length=self.md.tab_length,
+ style=local_config.pop('pygments_style', 'default'),
+ **local_config
+ )
+ placeholder = self.md.htmlStash.store(code.hilite())
# Clear codeblock in etree instance
block.clear()
- # Change to p element which will later
+ # Change to p element which will later
# be removed when inserting raw html
block.tag = 'p'
block.text = placeholder
-class CodeHiliteExtension(markdown.Extension):
- """ Add source code hilighting to markdown codeblocks. """
+class CodeHiliteExtension(Extension):
+ """ Add source code highlighting to markdown codeblocks. """
- def __init__(self, configs):
+ def __init__(self, **kwargs):
# define default configs
self.config = {
- 'force_linenos' : [False, "Force line numbers - Default: False"],
- 'css_class' : ["codehilite",
- "Set class name for wrapper <div> - Default: codehilite"],
+ 'linenums': [None,
+ "Use lines numbers. True|table|inline=yes, False=no, None=auto"],
+ 'guess_lang': [True,
+ "Automatic language detection - Default: True"],
+ 'css_class': ["codehilite",
+ "Set class name for wrapper <div> - "
+ "Default: codehilite"],
+ 'pygments_style': ['default',
+ 'Pygments HTML Formatter Style '
+ '(Colorscheme) - Default: default'],
+ 'noclasses': [False,
+ 'Use inline styles instead of CSS classes - '
+ 'Default false'],
+ 'use_pygments': [True,
+ 'Use Pygments to Highlight code blocks. '
+ 'Disable if using a JavaScript library. '
+ 'Default: True'],
+ 'lang_prefix': [
+ 'language-',
+ 'Prefix prepended to the language when use_pygments is false. Default: "language-"'
+ ],
+ 'pygments_formatter': ['html',
+ 'Use a specific formatter for Pygments highlighting.'
+ 'Default: "html"',
+ ],
}
-
- # Override defaults with user settings
- for key, value in configs:
- self.setConfig(key, value)
- def extendMarkdown(self, md, md_globals):
+ for key, value in kwargs.items():
+ if key in self.config:
+ self.setConfig(key, value)
+ else:
+ # manually set unknown keywords.
+ if isinstance(value, str):
+ try:
+ # Attempt to parse str as a bool value
+ value = parseBoolValue(value, preserve_none=True)
+ except ValueError:
+ pass # Assume it's not a bool value. Use as-is.
+ self.config[key] = [value, '']
+
+ def extendMarkdown(self, md):
""" Add HilitePostprocessor to Markdown instance. """
hiliter = HiliteTreeprocessor(md)
- hiliter.config = self.config
- md.treeprocessors.add("hilite", hiliter, "_begin")
+ hiliter.config = self.getConfigs()
+ md.treeprocessors.register(hiliter, 'hilite', 30)
+ md.registerExtension(self)
-def makeExtension(configs={}):
- return CodeHiliteExtension(configs=configs)
+def makeExtension(**kwargs): # pragma: no cover
+ return CodeHiliteExtension(**kwargs)
diff --git a/markdown/extensions/def_list.py b/markdown/extensions/def_list.py
index 73a1c85..17549f0 100644
--- a/markdown/extensions/def_list.py
+++ b/markdown/extensions/def_list.py
@@ -1,61 +1,71 @@
-#!/usr/bin/env Python
"""
Definition List Extension for Python-Markdown
=============================================
-Added parsing of Definition Lists to Python-Markdown.
+Adds parsing of Definition Lists to Python-Markdown.
-A simple example:
+See <https://Python-Markdown.github.io/extensions/definition_lists>
+for documentation.
- Apple
- : Pomaceous fruit of plants of the genus Malus in
- the family Rosaceae.
- : An american computer company.
+Original code Copyright 2008 [Waylan Limberg](http://achinghead.com)
- Orange
- : The fruit of an evergreen tree of the genus Citrus.
+All changes Copyright 2008-2014 The Python Markdown Project
-Copyright 2008 - [Waylan Limberg](http://achinghead.com)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-import markdown, re
-from markdown import etree
+from . import Extension
+from ..blockprocessors import BlockProcessor, ListIndentProcessor
+import xml.etree.ElementTree as etree
+import re
-class DefListProcessor(markdown.blockprocessors.BlockProcessor):
+class DefListProcessor(BlockProcessor):
""" Process Definition Lists. """
RE = re.compile(r'(^|\n)[ ]{0,3}:[ ]{1,3}(.*?)(\n|$)')
+ NO_INDENT_RE = re.compile(r'^[ ]{0,3}[^ :]')
def test(self, parent, block):
return bool(self.RE.search(block))
def run(self, parent, blocks):
- block = blocks.pop(0)
- m = self.RE.search(block)
- terms = [l.strip() for l in block[:m.start()].split('\n') if l.strip()]
- d, theRest = self.detab(block[m.end():])
+
+ raw_block = blocks.pop(0)
+ m = self.RE.search(raw_block)
+ terms = [term.strip() for term in
+ raw_block[:m.start()].split('\n') if term.strip()]
+ block = raw_block[m.end():]
+ no_indent = self.NO_INDENT_RE.match(block)
+ if no_indent:
+ d, theRest = (block, None)
+ else:
+ d, theRest = self.detab(block)
if d:
- d = '%s\n%s' % (m.group(2), d)
+ d = '{}\n{}'.format(m.group(2), d)
else:
d = m.group(2)
- #import ipdb; ipdb.set_trace()
sibling = self.lastChild(parent)
+ if not terms and sibling is None:
+ # This is not a definition item. Most likely a paragraph that
+ # starts with a colon at the beginning of a document or list.
+ blocks.insert(0, raw_block)
+ return False
if not terms and sibling.tag == 'p':
# The previous paragraph contains the terms
state = 'looselist'
terms = sibling.text.split('\n')
parent.remove(sibling)
- # Aquire new sibling
+ # Acquire new sibling
sibling = self.lastChild(parent)
else:
state = 'list'
- if sibling and sibling.tag == 'dl':
+ if sibling is not None and sibling.tag == 'dl':
# This is another item on an existing list
dl = sibling
- if len(dl) and dl[-1].tag == 'dd' and len(dl[-1]):
+ if not terms and len(dl) and dl[-1].tag == 'dd' and len(dl[-1]):
state = 'looselist'
else:
# This is a new list
@@ -73,32 +83,29 @@ class DefListProcessor(markdown.blockprocessors.BlockProcessor):
if theRest:
blocks.insert(0, theRest)
-class DefListIndentProcessor(markdown.blockprocessors.ListIndentProcessor):
+
+class DefListIndentProcessor(ListIndentProcessor):
""" Process indented children of definition list items. """
- ITEM_TYPES = ['dd']
- LIST_TYPES = ['dl']
+ # Definition lists need to be aware of all list types
+ ITEM_TYPES = ['dd', 'li']
+ LIST_TYPES = ['dl', 'ol', 'ul']
- def create_item(parent, block):
- """ Create a new dd and parse the block with it as the parent. """
- dd = markdown.etree.SubElement(parent, 'dd')
+ def create_item(self, parent, block):
+ """ Create a new dd or li (depending on parent) and parse the block with it as the parent. """
+
+ dd = etree.SubElement(parent, 'dd')
self.parser.parseBlocks(dd, [block])
-
-class DefListExtension(markdown.Extension):
+class DefListExtension(Extension):
""" Add definition lists to Markdown. """
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Add an instance of DefListProcessor to BlockParser. """
- md.parser.blockprocessors.add('defindent',
- DefListIndentProcessor(md.parser),
- '>indent')
- md.parser.blockprocessors.add('deflist',
- DefListProcessor(md.parser),
- '>ulist')
-
+ md.parser.blockprocessors.register(DefListIndentProcessor(md.parser), 'defindent', 85)
+ md.parser.blockprocessors.register(DefListProcessor(md.parser), 'deflist', 25)
-def makeExtension(configs={}):
- return DefListExtension(configs=configs)
+def makeExtension(**kwargs): # pragma: no cover
+ return DefListExtension(**kwargs)
diff --git a/markdown/extensions/extra.py b/markdown/extensions/extra.py
index 4a2ffbf..909ba07 100644
--- a/markdown/extensions/extra.py
+++ b/markdown/extensions/extra.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
"""
Python-Markdown Extra Extension
===============================
@@ -7,43 +6,53 @@ A compilation of various Python-Markdown extensions that imitates
[PHP Markdown Extra](http://michelf.com/projects/php-markdown/extra/).
Note that each of the individual extensions still need to be available
-on your PYTHONPATH. This extension simply wraps them all up as a
+on your PYTHONPATH. This extension simply wraps them all up as a
convenience so that only one extension needs to be listed when
initiating Markdown. See the documentation for each individual
extension for specifics about that extension.
-In the event that one or more of the supported extensions are not
-available for import, Markdown will issue a warning and simply continue
-without that extension.
-
-There may be additional extensions that are distributed with
+There may be additional extensions that are distributed with
Python-Markdown that are not included here in Extra. Those extensions
are not part of PHP Markdown Extra, and therefore, not part of
Python-Markdown Extra. If you really would like Extra to include
additional extensions, we suggest creating your own clone of Extra
-under a differant name. You could also edit the `extensions` global
-variable defined below, but be aware that such changes may be lost
+under a different name. You could also edit the `extensions` global
+variable defined below, but be aware that such changes may be lost
when you upgrade to any future version of Python-Markdown.
+See <https://Python-Markdown.github.io/extensions/extra>
+for documentation.
+
+Copyright The Python Markdown Project
+
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
+
"""
-import markdown
+from . import Extension
-extensions = ['fenced_code',
- 'footnotes',
- 'headerid',
- 'def_list',
- 'tables',
- 'abbr',
- ]
-
+extensions = [
+ 'fenced_code',
+ 'footnotes',
+ 'attr_list',
+ 'def_list',
+ 'tables',
+ 'abbr',
+ 'md_in_html'
+]
-class ExtraExtension(markdown.Extension):
+
+class ExtraExtension(Extension):
""" Add various extensions to Markdown class."""
- def extendMarkdown(self, md, md_globals):
+ def __init__(self, **kwargs):
+ """ config is a dumb holder which gets passed to actual ext later. """
+ self.config = kwargs
+
+ def extendMarkdown(self, md):
""" Register extension instances. """
md.registerExtensions(extensions, self.config)
-def makeExtension(configs={}):
- return ExtraExtension(configs=dict(configs))
+
+def makeExtension(**kwargs): # pragma: no cover
+ return ExtraExtension(**kwargs)
diff --git a/markdown/extensions/fenced_code.py b/markdown/extensions/fenced_code.py
index 307b1dc..409166a 100644
--- a/markdown/extensions/fenced_code.py
+++ b/markdown/extensions/fenced_code.py
@@ -1,104 +1,166 @@
-#!/usr/bin/env python
-
"""
Fenced Code Extension for Python Markdown
=========================================
This extension adds Fenced Code Blocks to Python-Markdown.
- >>> import markdown
- >>> text = '''
- ... A paragraph before a fenced code block:
- ...
- ... ~~~
- ... Fenced code block
- ... ~~~
- ... '''
- >>> html = markdown.markdown(text, extensions=['fenced_code'])
- >>> html
- u'<p>A paragraph before a fenced code block:</p>\\n<pre><code>Fenced code block\\n</code></pre>'
-
-Works with safe_mode also (we check this because we are using the HtmlStash):
-
- >>> markdown.markdown(text, extensions=['fenced_code'], safe_mode='replace')
- u'<p>A paragraph before a fenced code block:</p>\\n<pre><code>Fenced code block\\n</code></pre>'
-
-Include tilde's in a code block and wrap with blank lines:
-
- >>> text = '''
- ... ~~~~~~~~
- ...
- ... ~~~~
- ...
- ... ~~~~~~~~'''
- >>> markdown.markdown(text, extensions=['fenced_code'])
- u'<pre><code>\\n~~~~\\n\\n</code></pre>'
-
-Multiple blocks and language tags:
-
- >>> text = '''
- ... ~~~~{.python}
- ... block one
- ... ~~~~
- ...
- ... ~~~~.html
- ... <p>block two</p>
- ... ~~~~'''
- >>> markdown.markdown(text, extensions=['fenced_code'])
- u'<pre><code class="python">block one\\n</code></pre>\\n\\n<pre><code class="html">&lt;p&gt;block two&lt;/p&gt;\\n</code></pre>'
-
-Copyright 2007-2008 [Waylan Limberg](http://achinghead.com/).
-
-Project website: <http://www.freewisdom.org/project/python-markdown/Fenced__Code__Blocks>
-Contact: markdown@freewisdom.org
-
-License: BSD (see ../docs/LICENSE for details)
-
-Dependencies:
-* [Python 2.3+](http://python.org)
-* [Markdown 2.0+](http://www.freewisdom.org/projects/python-markdown/)
+See <https://Python-Markdown.github.io/extensions/fenced_code_blocks>
+for documentation.
+
+Original code Copyright 2007-2008 [Waylan Limberg](http://achinghead.com/).
+
+
+All changes Copyright 2008-2014 The Python Markdown Project
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-import markdown, re
-# Global vars
-FENCED_BLOCK_RE = re.compile( \
- r'(?P<fence>^~{3,})[ ]*(\{?\.(?P<lang>[a-zA-Z0-9_-]*)\}?)?[ ]*\n(?P<code>.*?)(?P=fence)[ ]*$',
- re.MULTILINE|re.DOTALL
- )
-CODE_WRAP = '<pre><code%s>%s</code></pre>'
-LANG_TAG = ' class="%s"'
+from textwrap import dedent
+from . import Extension
+from ..preprocessors import Preprocessor
+from .codehilite import CodeHilite, CodeHiliteExtension, parse_hl_lines
+from .attr_list import get_attrs, AttrListExtension
+from ..util import parseBoolValue
+from ..serializers import _escape_attrib_html
+import re
-class FencedCodeExtension(markdown.Extension):
+class FencedCodeExtension(Extension):
+ def __init__(self, **kwargs):
+ self.config = {
+ 'lang_prefix': ['language-', 'Prefix prepended to the language. Default: "language-"']
+ }
+ super().__init__(**kwargs)
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Add FencedBlockPreprocessor to the Markdown instance. """
+ md.registerExtension(self)
+
+ md.preprocessors.register(FencedBlockPreprocessor(md, self.getConfigs()), 'fenced_code_block', 25)
+
+
+class FencedBlockPreprocessor(Preprocessor):
+ FENCED_BLOCK_RE = re.compile(
+ dedent(r'''
+ (?P<fence>^(?:~{3,}|`{3,}))[ ]* # opening fence
+ ((\{(?P<attrs>[^\}\n]*)\})| # (optional {attrs} or
+ (\.?(?P<lang>[\w#.+-]*)[ ]*)? # optional (.)lang
+ (hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot)[ ]*)?) # optional hl_lines)
+ \n # newline (end of opening fence)
+ (?P<code>.*?)(?<=\n) # the code block
+ (?P=fence)[ ]*$ # closing fence
+ '''),
+ re.MULTILINE | re.DOTALL | re.VERBOSE
+ )
- md.preprocessors.add('fenced_code_block',
- FencedBlockPreprocessor(md),
- "_begin")
-
+ def __init__(self, md, config):
+ super().__init__(md)
+ self.config = config
+ self.checked_for_deps = False
+ self.codehilite_conf = {}
+ self.use_attr_list = False
+ # List of options to convert to bool values
+ self.bool_options = [
+ 'linenums',
+ 'guess_lang',
+ 'noclasses',
+ 'use_pygments'
+ ]
-class FencedBlockPreprocessor(markdown.preprocessors.Preprocessor):
-
def run(self, lines):
""" Match and store Fenced Code Blocks in the HtmlStash. """
+
+ # Check for dependent extensions
+ if not self.checked_for_deps:
+ for ext in self.md.registeredExtensions:
+ if isinstance(ext, CodeHiliteExtension):
+ self.codehilite_conf = ext.getConfigs()
+ if isinstance(ext, AttrListExtension):
+ self.use_attr_list = True
+
+ self.checked_for_deps = True
+
text = "\n".join(lines)
while 1:
- m = FENCED_BLOCK_RE.search(text)
+ m = self.FENCED_BLOCK_RE.search(text)
if m:
- lang = ''
- if m.group('lang'):
- lang = LANG_TAG % m.group('lang')
- code = CODE_WRAP % (lang, self._escape(m.group('code')))
- placeholder = self.markdown.htmlStash.store(code, safe=True)
- text = '%s\n%s\n%s'% (text[:m.start()], placeholder, text[m.end():])
+ lang, id, classes, config = None, '', [], {}
+ if m.group('attrs'):
+ id, classes, config = self.handle_attrs(get_attrs(m.group('attrs')))
+ if len(classes):
+ lang = classes.pop(0)
+ else:
+ if m.group('lang'):
+ lang = m.group('lang')
+ if m.group('hl_lines'):
+ # Support hl_lines outside of attrs for backward-compatibility
+ config['hl_lines'] = parse_hl_lines(m.group('hl_lines'))
+
+ # If config is not empty, then the codehighlite extension
+ # is enabled, so we call it to highlight the code
+ if self.codehilite_conf and self.codehilite_conf['use_pygments'] and config.get('use_pygments', True):
+ local_config = self.codehilite_conf.copy()
+ local_config.update(config)
+ # Combine classes with cssclass. Ensure cssclass is at end
+ # as pygments appends a suffix under certain circumstances.
+ # Ignore ID as Pygments does not offer an option to set it.
+ if classes:
+ local_config['css_class'] = '{} {}'.format(
+ ' '.join(classes),
+ local_config['css_class']
+ )
+ highliter = CodeHilite(
+ m.group('code'),
+ lang=lang,
+ style=local_config.pop('pygments_style', 'default'),
+ **local_config
+ )
+
+ code = highliter.hilite(shebang=False)
+ else:
+ id_attr = lang_attr = class_attr = kv_pairs = ''
+ if lang:
+ prefix = self.config.get('lang_prefix', 'language-')
+ lang_attr = f' class="{prefix}{_escape_attrib_html(lang)}"'
+ if classes:
+ class_attr = f' class="{_escape_attrib_html(" ".join(classes))}"'
+ if id:
+ id_attr = f' id="{_escape_attrib_html(id)}"'
+ if self.use_attr_list and config and not config.get('use_pygments', False):
+ # Only assign key/value pairs to code element if attr_list ext is enabled, key/value pairs
+ # were defined on the code block, and the `use_pygments` key was not set to True. The
+ # `use_pygments` key could be either set to False or not defined. It is omitted from output.
+ kv_pairs = ''.join(
+ f' {k}="{_escape_attrib_html(v)}"' for k, v in config.items() if k != 'use_pygments'
+ )
+ code = self._escape(m.group('code'))
+ code = f'<pre{id_attr}{class_attr}><code{lang_attr}{kv_pairs}>{code}</code></pre>'
+
+ placeholder = self.md.htmlStash.store(code)
+ text = f'{text[:m.start()]}\n{placeholder}\n{text[m.end():]}'
else:
break
return text.split("\n")
+ def handle_attrs(self, attrs):
+ """ Return tuple: (id, [list, of, classes], {configs}) """
+ id = ''
+ classes = []
+ configs = {}
+ for k, v in attrs:
+ if k == 'id':
+ id = v
+ elif k == '.':
+ classes.append(v)
+ elif k == 'hl_lines':
+ configs[k] = parse_hl_lines(v)
+ elif k in self.bool_options:
+ configs[k] = parseBoolValue(v, fail_on_errors=False, preserve_none=True)
+ else:
+ configs[k] = v
+ return id, classes, configs
+
def _escape(self, txt):
""" basic html escaping """
txt = txt.replace('&', '&amp;')
@@ -108,10 +170,5 @@ class FencedBlockPreprocessor(markdown.preprocessors.Preprocessor):
return txt
-def makeExtension(configs=None):
- return FencedCodeExtension()
-
-
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
+def makeExtension(**kwargs): # pragma: no cover
+ return FencedCodeExtension(**kwargs)
diff --git a/markdown/extensions/footnotes.py b/markdown/extensions/footnotes.py
index e1a9cda..96ed5c2 100644
--- a/markdown/extensions/footnotes.py
+++ b/markdown/extensions/footnotes.py
@@ -1,81 +1,126 @@
"""
-========================= FOOTNOTES =================================
+Footnotes Extension for Python-Markdown
+=======================================
-This section adds footnote handling to markdown. It can be used as
-an example for extending python-markdown with relatively complex
-functionality. While in this case the extension is included inside
-the module itself, it could just as easily be added from outside the
-module. Not that all markdown classes above are ignorant about
-footnotes. All footnote functionality is provided separately and
-then added to the markdown instance at the run time.
+Adds footnote handling to Python-Markdown.
-Footnote functionality is attached by calling extendMarkdown()
-method of FootnoteExtension. The method also registers the
-extension to allow it's state to be reset by a call to reset()
-method.
+See <https://Python-Markdown.github.io/extensions/footnotes>
+for documentation.
-Example:
- Footnotes[^1] have a label[^label] and a definition[^!DEF].
+Copyright The Python Markdown Project
- [^1]: This is a footnote
- [^label]: A footnote on "label"
- [^!DEF]: The footnote for definition
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-import re, markdown
-from markdown import etree
+from . import Extension
+from ..blockprocessors import BlockProcessor
+from ..inlinepatterns import InlineProcessor
+from ..treeprocessors import Treeprocessor
+from ..postprocessors import Postprocessor
+from .. import util
+from collections import OrderedDict
+import re
+import copy
+import xml.etree.ElementTree as etree
-FN_BACKLINK_TEXT = "zz1337820767766393qq"
-NBSP_PLACEHOLDER = "qq3936677670287331zz"
-DEF_RE = re.compile(r'(\ ?\ ?\ ?)\[\^([^\]]*)\]:\s*(.*)')
-TABBED_RE = re.compile(r'((\t)|( ))(.*)')
+FN_BACKLINK_TEXT = util.STX + "zz1337820767766393qq" + util.ETX
+NBSP_PLACEHOLDER = util.STX + "qq3936677670287331zz" + util.ETX
+RE_REF_ID = re.compile(r'(fnref)(\d+)')
-class FootnoteExtension(markdown.Extension):
+
+class FootnoteExtension(Extension):
""" Footnote Extension. """
- def __init__ (self, configs):
+ def __init__(self, **kwargs):
""" Setup configs. """
- self.config = {'PLACE_MARKER':
- ["///Footnotes Go Here///",
- "The text string that marks where the footnotes go"],
- 'UNIQUE_IDS':
- [False,
- "Avoid name collisions across "
- "multiple calls to reset()."]}
- for key, value in configs:
- self.config[key][0] = value
+ self.config = {
+ 'PLACE_MARKER':
+ ["///Footnotes Go Here///",
+ "The text string that marks where the footnotes go"],
+ 'UNIQUE_IDS':
+ [False,
+ "Avoid name collisions across "
+ "multiple calls to reset()."],
+ "BACKLINK_TEXT":
+ ["&#8617;",
+ "The text string that links from the footnote "
+ "to the reader's place."],
+ "SUPERSCRIPT_TEXT":
+ ["{}",
+ "The text string that links from the reader's place "
+ "to the footnote."],
+ "BACKLINK_TITLE":
+ ["Jump back to footnote %d in the text",
+ "The text string used for the title HTML attribute "
+ "of the backlink. %d will be replaced by the "
+ "footnote number."],
+ "SEPARATOR":
+ [":",
+ "Footnote separator."]
+ }
+ super().__init__(**kwargs)
# In multiple invocations, emit links that don't get tangled.
self.unique_prefix = 0
+ self.found_refs = {}
+ self.used_refs = set()
self.reset()
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Add pieces to Markdown. """
md.registerExtension(self)
self.parser = md.parser
- # Insert a preprocessor before ReferencePreprocessor
- md.preprocessors.add("footnote", FootnotePreprocessor(self),
- "<reference")
+ self.md = md
+ # Insert a blockprocessor before ReferencePreprocessor
+ md.parser.blockprocessors.register(FootnoteBlockProcessor(self), 'footnote', 17)
+
# Insert an inline pattern before ImageReferencePattern
- FOOTNOTE_RE = r'\[\^([^\]]*)\]' # blah blah [^1] blah
- md.inlinePatterns.add("footnote", FootnotePattern(FOOTNOTE_RE, self),
- "<reference")
+ FOOTNOTE_RE = r'\[\^([^\]]*)\]' # blah blah [^1] blah
+ md.inlinePatterns.register(FootnoteInlineProcessor(FOOTNOTE_RE, self), 'footnote', 175)
# Insert a tree-processor that would actually add the footnote div
- # This must be before the inline treeprocessor so inline patterns
- # run on the contents of the div.
- md.treeprocessors.add("footnote", FootnoteTreeprocessor(self),
- "<inline")
- # Insert a postprocessor after amp_substitute oricessor
- md.postprocessors.add("footnote", FootnotePostprocessor(self),
- ">amp_substitute")
+ # This must be before all other treeprocessors (i.e., inline and
+ # codehilite) so they can run on the the contents of the div.
+ md.treeprocessors.register(FootnoteTreeprocessor(self), 'footnote', 50)
+
+ # Insert a tree-processor that will run after inline is done.
+ # In this tree-processor we want to check our duplicate footnote tracker
+ # And add additional backrefs to the footnote pointing back to the
+ # duplicated references.
+ md.treeprocessors.register(FootnotePostTreeprocessor(self), 'footnote-duplicate', 15)
+
+ # Insert a postprocessor after amp_substitute processor
+ md.postprocessors.register(FootnotePostprocessor(self), 'footnote', 25)
def reset(self):
- """ Clear the footnotes on reset, and prepare for a distinct document. """
- self.footnotes = markdown.odict.OrderedDict()
+ """ Clear footnotes on reset, and prepare for distinct document. """
+ self.footnotes = OrderedDict()
self.unique_prefix += 1
+ self.found_refs = {}
+ self.used_refs = set()
+
+ def unique_ref(self, reference, found=False):
+ """ Get a unique reference if there are duplicates. """
+ if not found:
+ return reference
+
+ original_ref = reference
+ while reference in self.used_refs:
+ ref, rest = reference.split(self.get_separator(), 1)
+ m = RE_REF_ID.match(ref)
+ if m:
+ reference = '%s%d%s%s' % (m.group(1), int(m.group(2))+1, self.get_separator(), rest)
+ else:
+ reference = '%s%d%s%s' % (ref, 2, self.get_separator(), rest)
+
+ self.used_refs.add(reference)
+ if original_ref in self.found_refs:
+ self.found_refs[original_ref] += 1
+ else:
+ self.found_refs[original_ref] = 1
+ return reference
def findFootnotesPlaceholder(self, root):
""" Return ElementTree Element that contains Footnote placeholder. """
@@ -83,13 +128,15 @@ class FootnoteExtension(markdown.Extension):
for child in element:
if child.text:
if child.text.find(self.getConfig("PLACE_MARKER")) > -1:
- return child, True
+ return child, element, True
if child.tail:
if child.tail.find(self.getConfig("PLACE_MARKER")) > -1:
- return (child, element), False
- finder(child)
+ return child, element, False
+ child_res = finder(child)
+ if child_res is not None:
+ return child_res
return None
-
+
res = finder(root)
return res
@@ -97,43 +144,59 @@ class FootnoteExtension(markdown.Extension):
""" Store a footnote for later retrieval. """
self.footnotes[id] = text
+ def get_separator(self):
+ """ Get the footnote separator. """
+ return self.getConfig("SEPARATOR")
+
def makeFootnoteId(self, id):
""" Return footnote link id. """
if self.getConfig("UNIQUE_IDS"):
- return 'fn:%d-%s' % (self.unique_prefix, id)
+ return 'fn%s%d-%s' % (self.get_separator(), self.unique_prefix, id)
else:
- return 'fn:%s' % id
+ return 'fn{}{}'.format(self.get_separator(), id)
- def makeFootnoteRefId(self, id):
+ def makeFootnoteRefId(self, id, found=False):
""" Return footnote back-link id. """
if self.getConfig("UNIQUE_IDS"):
- return 'fnref:%d-%s' % (self.unique_prefix, id)
+ return self.unique_ref('fnref%s%d-%s' % (self.get_separator(), self.unique_prefix, id), found)
else:
- return 'fnref:%s' % id
+ return self.unique_ref('fnref{}{}'.format(self.get_separator(), id), found)
def makeFootnotesDiv(self, root):
""" Return div of footnotes as et Element. """
- if not self.footnotes.keys():
+ if not list(self.footnotes.keys()):
return None
div = etree.Element("div")
div.set('class', 'footnote')
- hr = etree.SubElement(div, "hr")
+ etree.SubElement(div, "hr")
ol = etree.SubElement(div, "ol")
+ surrogate_parent = etree.Element("div")
+
+ # Backward compatibility with old '%d' placeholder
+ backlink_title = self.getConfig("BACKLINK_TITLE").replace("%d", "{}")
- for id in self.footnotes.keys():
+ for index, id in enumerate(self.footnotes.keys(), start=1):
li = etree.SubElement(ol, "li")
li.set("id", self.makeFootnoteId(id))
- self.parser.parseChunk(li, self.footnotes[id])
+ # Parse footnote with surrogate parent as li cannot be used.
+ # List block handlers have special logic to deal with li.
+ # When we are done parsing, we will copy everything over to li.
+ self.parser.parseChunk(surrogate_parent, self.footnotes[id])
+ for el in list(surrogate_parent):
+ li.append(el)
+ surrogate_parent.remove(el)
backlink = etree.Element("a")
backlink.set("href", "#" + self.makeFootnoteRefId(id))
- backlink.set("rev", "footnote")
- backlink.set("title", "Jump back to footnote %d in the text" % \
- (self.footnotes.index(id)+1))
+ backlink.set("class", "footnote-backref")
+ backlink.set(
+ "title",
+ backlink_title.format(index)
+ )
backlink.text = FN_BACKLINK_TEXT
- if li.getchildren():
+ if len(li):
node = li[-1]
if node.tag == "p":
node.text = node.text + NBSP_PLACEHOLDER
@@ -144,164 +207,205 @@ class FootnoteExtension(markdown.Extension):
return div
-class FootnotePreprocessor(markdown.preprocessors.Preprocessor):
+class FootnoteBlockProcessor(BlockProcessor):
""" Find all footnote references and store for later use. """
- def __init__ (self, footnotes):
- self.footnotes = footnotes
-
- def run(self, lines):
- lines = self._handleFootnoteDefinitions(lines)
- text = "\n".join(lines)
- return text.split("\n")
+ RE = re.compile(r'^[ ]{0,3}\[\^([^\]]*)\]:[ ]*(.*)$', re.MULTILINE)
- def _handleFootnoteDefinitions(self, lines):
- """
- Recursively find all footnote definitions in lines.
-
- Keywords:
+ def __init__(self, footnotes):
+ super().__init__(footnotes.parser)
+ self.footnotes = footnotes
- * lines: A list of lines of text
-
- Return: A list of lines with footnote definitions removed.
-
- """
- i, id, footnote = self._findFootnoteDefinition(lines)
-
- if id :
- plain = lines[:i]
- detabbed, theRest = self.detectTabbed(lines[i+1:])
- self.footnotes.setFootnote(id,
- footnote + "\n"
- + "\n".join(detabbed))
- more_plain = self._handleFootnoteDefinitions(theRest)
- return plain + [""] + more_plain
- else :
- return lines
-
- def _findFootnoteDefinition(self, lines):
- """
- Find the parts of a footnote definition.
+ def test(self, parent, block):
+ return True
+
+ def run(self, parent, blocks):
+ """ Find, set, and remove footnote definitions. """
+ block = blocks.pop(0)
+ m = self.RE.search(block)
+ if m:
+ id = m.group(1)
+ fn_blocks = [m.group(2)]
+
+ # Handle rest of block
+ therest = block[m.end():].lstrip('\n')
+ m2 = self.RE.search(therest)
+ if m2:
+ # Another footnote exists in the rest of this block.
+ # Any content before match is continuation of this footnote, which may be lazily indented.
+ before = therest[:m2.start()].rstrip('\n')
+ fn_blocks[0] = '\n'.join([fn_blocks[0], self.detab(before)]).lstrip('\n')
+ # Add back to blocks everything from beginning of match forward for next iteration.
+ blocks.insert(0, therest[m2.start():])
+ else:
+ # All remaining lines of block are continuation of this footnote, which may be lazily indented.
+ fn_blocks[0] = '\n'.join([fn_blocks[0], self.detab(therest)]).strip('\n')
- Keywords:
+ # Check for child elements in remaining blocks.
+ fn_blocks.extend(self.detectTabbed(blocks))
- * lines: A list of lines of text.
+ footnote = "\n\n".join(fn_blocks)
+ self.footnotes.setFootnote(id, footnote.rstrip())
- Return: A three item tuple containing the index of the first line of a
- footnote definition, the id of the definition and the body of the
- definition.
-
- """
- counter = 0
- for line in lines:
- m = DEF_RE.match(line)
- if m:
- return counter, m.group(2), m.group(3)
- counter += 1
- return counter, None, None
+ if block[:m.start()].strip():
+ # Add any content before match back to blocks as separate block
+ blocks.insert(0, block[:m.start()].rstrip('\n'))
+ return True
+ # No match. Restore block.
+ blocks.insert(0, block)
+ return False
- def detectTabbed(self, lines):
+ def detectTabbed(self, blocks):
""" Find indented text and remove indent before further proccesing.
- Keyword arguments:
-
- * lines: an array of strings
-
- Returns: a list of post processed items and the unused
- remainder of the original list
-
+ Returns: a list of blocks with indentation removed.
"""
- items = []
- item = -1
- i = 0 # to keep track of where we are
-
- def detab(line):
- match = TABBED_RE.match(line)
- if match:
- return match.group(4)
-
- for line in lines:
- if line.strip(): # Non-blank line
- line = detab(line)
- if line:
- items.append(line)
- i += 1
- continue
- else:
- return items, lines[i:]
-
- else: # Blank line: _maybe_ we are done.
- i += 1 # advance
-
- # Find the next non-blank line
- for j in range(i, len(lines)):
- if lines[j].strip():
- next_line = lines[j]; break
+ fn_blocks = []
+ while blocks:
+ if blocks[0].startswith(' '*4):
+ block = blocks.pop(0)
+ # Check for new footnotes within this block and split at new footnote.
+ m = self.RE.search(block)
+ if m:
+ # Another footnote exists in this block.
+ # Any content before match is continuation of this footnote, which may be lazily indented.
+ before = block[:m.start()].rstrip('\n')
+ fn_blocks.append(self.detab(before))
+ # Add back to blocks everything from beginning of match forward for next iteration.
+ blocks.insert(0, block[m.start():])
+ # End of this footnote.
+ break
else:
- break # There is no more text; we are done.
+ # Entire block is part of this footnote.
+ fn_blocks.append(self.detab(block))
+ else:
+ # End of this footnote.
+ break
+ return fn_blocks
- # Check if the next non-blank line is tabbed
- if detab(next_line): # Yes, more work to do.
- items.append("")
- continue
- else:
- break # No, we are done.
- else:
- i += 1
+ def detab(self, block):
+ """ Remove one level of indent from a block.
- return items, lines[i:]
+ Preserve lazily indented blocks by only removing indent from indented lines.
+ """
+ lines = block.split('\n')
+ for i, line in enumerate(lines):
+ if line.startswith(' '*4):
+ lines[i] = line[4:]
+ return '\n'.join(lines)
-class FootnotePattern(markdown.inlinepatterns.Pattern):
+class FootnoteInlineProcessor(InlineProcessor):
""" InlinePattern for footnote markers in a document's body text. """
def __init__(self, pattern, footnotes):
- markdown.inlinepatterns.Pattern.__init__(self, pattern)
+ super().__init__(pattern)
self.footnotes = footnotes
- def handleMatch(self, m):
- sup = etree.Element("sup")
- a = etree.SubElement(sup, "a")
- id = m.group(2)
- sup.set('id', self.footnotes.makeFootnoteRefId(id))
- a.set('href', '#' + self.footnotes.makeFootnoteId(id))
- a.set('rel', 'footnote')
- a.text = str(self.footnotes.footnotes.index(id) + 1)
- return sup
+ def handleMatch(self, m, data):
+ id = m.group(1)
+ if id in self.footnotes.footnotes.keys():
+ sup = etree.Element("sup")
+ a = etree.SubElement(sup, "a")
+ sup.set('id', self.footnotes.makeFootnoteRefId(id, found=True))
+ a.set('href', '#' + self.footnotes.makeFootnoteId(id))
+ a.set('class', 'footnote-ref')
+ a.text = self.footnotes.getConfig("SUPERSCRIPT_TEXT").format(
+ list(self.footnotes.footnotes.keys()).index(id) + 1
+ )
+ return sup, m.start(0), m.end(0)
+ else:
+ return None, None, None
+
+
+class FootnotePostTreeprocessor(Treeprocessor):
+ """ Amend footnote div with duplicates. """
+
+ def __init__(self, footnotes):
+ self.footnotes = footnotes
+ def add_duplicates(self, li, duplicates):
+ """ Adjust current li and add the duplicates: fnref2, fnref3, etc. """
+ for link in li.iter('a'):
+ # Find the link that needs to be duplicated.
+ if link.attrib.get('class', '') == 'footnote-backref':
+ ref, rest = link.attrib['href'].split(self.footnotes.get_separator(), 1)
+ # Duplicate link the number of times we need to
+ # and point the to the appropriate references.
+ links = []
+ for index in range(2, duplicates + 1):
+ sib_link = copy.deepcopy(link)
+ sib_link.attrib['href'] = '%s%d%s%s' % (ref, index, self.footnotes.get_separator(), rest)
+ links.append(sib_link)
+ self.offset += 1
+ # Add all the new duplicate links.
+ el = list(li)[-1]
+ for link in links:
+ el.append(link)
+ break
+
+ def get_num_duplicates(self, li):
+ """ Get the number of duplicate refs of the footnote. """
+ fn, rest = li.attrib.get('id', '').split(self.footnotes.get_separator(), 1)
+ link_id = '{}ref{}{}'.format(fn, self.footnotes.get_separator(), rest)
+ return self.footnotes.found_refs.get(link_id, 0)
+
+ def handle_duplicates(self, parent):
+ """ Find duplicate footnotes and format and add the duplicates. """
+ for li in list(parent):
+ # Check number of duplicates footnotes and insert
+ # additional links if needed.
+ count = self.get_num_duplicates(li)
+ if count > 1:
+ self.add_duplicates(li, count)
-class FootnoteTreeprocessor(markdown.treeprocessors.Treeprocessor):
+ def run(self, root):
+ """ Crawl the footnote div and add missing duplicate footnotes. """
+ self.offset = 0
+ for div in root.iter('div'):
+ if div.attrib.get('class', '') == 'footnote':
+ # Footnotes should be under the first ordered list under
+ # the footnote div. So once we find it, quit.
+ for ol in div.iter('ol'):
+ self.handle_duplicates(ol)
+ break
+
+
+class FootnoteTreeprocessor(Treeprocessor):
""" Build and append footnote div to end of document. """
- def __init__ (self, footnotes):
+ def __init__(self, footnotes):
self.footnotes = footnotes
def run(self, root):
footnotesDiv = self.footnotes.makeFootnotesDiv(root)
- if footnotesDiv:
+ if footnotesDiv is not None:
result = self.footnotes.findFootnotesPlaceholder(root)
if result:
- node, isText = result
+ child, parent, isText = result
+ ind = list(parent).index(child)
if isText:
- node.text = None
- node.getchildren().insert(0, footnotesDiv)
+ parent.remove(child)
+ parent.insert(ind, footnotesDiv)
else:
- child, element = node
- ind = element.getchildren().find(child)
- element.getchildren().insert(ind + 1, footnotesDiv)
+ parent.insert(ind + 1, footnotesDiv)
child.tail = None
- fnPlaceholder.parent.replaceChild(fnPlaceholder, footnotesDiv)
else:
root.append(footnotesDiv)
-class FootnotePostprocessor(markdown.postprocessors.Postprocessor):
+
+class FootnotePostprocessor(Postprocessor):
""" Replace placeholders with html entities. """
+ def __init__(self, footnotes):
+ self.footnotes = footnotes
def run(self, text):
- text = text.replace(FN_BACKLINK_TEXT, "&#8617;")
+ text = text.replace(
+ FN_BACKLINK_TEXT, self.footnotes.getConfig("BACKLINK_TEXT")
+ )
return text.replace(NBSP_PLACEHOLDER, "&#160;")
-def makeExtension(configs=[]):
- """ Return an instance of the FootnoteExtension """
- return FootnoteExtension(configs=configs)
+def makeExtension(**kwargs): # pragma: no cover
+ """ Return an instance of the FootnoteExtension """
+ return FootnoteExtension(**kwargs)
diff --git a/markdown/extensions/headerid.py b/markdown/extensions/headerid.py
deleted file mode 100644
index f70a7a9..0000000
--- a/markdown/extensions/headerid.py
+++ /dev/null
@@ -1,195 +0,0 @@
-#!/usr/bin/python
-
-"""
-HeaderID Extension for Python-Markdown
-======================================
-
-Adds ability to set HTML IDs for headers.
-
-Basic usage:
-
- >>> import markdown
- >>> text = "# Some Header # {#some_id}"
- >>> md = markdown.markdown(text, ['headerid'])
- >>> md
- u'<h1 id="some_id">Some Header</h1>'
-
-All header IDs are unique:
-
- >>> text = '''
- ... #Header
- ... #Another Header {#header}
- ... #Third Header {#header}'''
- >>> md = markdown.markdown(text, ['headerid'])
- >>> md
- u'<h1 id="header">Header</h1>\\n<h1 id="header_1">Another Header</h1>\\n<h1 id="header_2">Third Header</h1>'
-
-To fit within a html template's hierarchy, set the header base level:
-
- >>> text = '''
- ... #Some Header
- ... ## Next Level'''
- >>> md = markdown.markdown(text, ['headerid(level=3)'])
- >>> md
- u'<h3 id="some_header">Some Header</h3>\\n<h4 id="next_level">Next Level</h4>'
-
-Turn off auto generated IDs:
-
- >>> text = '''
- ... # Some Header
- ... # Header with ID # { #foo }'''
- >>> md = markdown.markdown(text, ['headerid(forceid=False)'])
- >>> md
- u'<h1>Some Header</h1>\\n<h1 id="foo">Header with ID</h1>'
-
-Use with MetaData extension:
-
- >>> text = '''header_level: 2
- ... header_forceid: Off
- ...
- ... # A Header'''
- >>> md = markdown.markdown(text, ['headerid', 'meta'])
- >>> md
- u'<h2>A Header</h2>'
-
-Copyright 2007-2008 [Waylan Limberg](http://achinghead.com/).
-
-Project website: <http://www.freewisdom.org/project/python-markdown/HeaderId>
-Contact: markdown@freewisdom.org
-
-License: BSD (see ../docs/LICENSE for details)
-
-Dependencies:
-* [Python 2.3+](http://python.org)
-* [Markdown 2.0+](http://www.freewisdom.org/projects/python-markdown/)
-
-"""
-
-import markdown
-from markdown import etree
-import re
-from string import ascii_lowercase, digits, punctuation
-
-ID_CHARS = ascii_lowercase + digits + '-_'
-IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')
-
-
-class HeaderIdProcessor(markdown.blockprocessors.BlockProcessor):
- """ Replacement BlockProcessor for Header IDs. """
-
- # Detect a header at start of any line in block
- RE = re.compile(r"""(^|\n)
- (?P<level>\#{1,6}) # group('level') = string of hashes
- (?P<header>.*?) # group('header') = Header text
- \#* # optional closing hashes
- (?:[ \t]*\{[ \t]*\#(?P<id>[-_:a-zA-Z0-9]+)[ \t]*\})?
- (\n|$) # ^^ group('id') = id attribute
- """,
- re.VERBOSE)
-
- IDs = []
-
- def test(self, parent, block):
- return bool(self.RE.search(block))
-
- def run(self, parent, blocks):
- block = blocks.pop(0)
- m = self.RE.search(block)
- if m:
- before = block[:m.start()] # All lines before header
- after = block[m.end():] # All lines after header
- if before:
- # As the header was not the first line of the block and the
- # lines before the header must be parsed first,
- # recursively parse this lines as a block.
- self.parser.parseBlocks(parent, [before])
- # Create header using named groups from RE
- start_level, force_id = self._get_meta()
- level = len(m.group('level')) + start_level
- if level > 6:
- level = 6
- h = markdown.etree.SubElement(parent, 'h%d' % level)
- h.text = m.group('header').strip()
- if m.group('id'):
- h.set('id', self._unique_id(m.group('id')))
- elif force_id:
- h.set('id', self._create_id(m.group('header').strip()))
- if after:
- # Insert remaining lines as first block for future parsing.
- blocks.insert(0, after)
- else:
- # This should never happen, but just in case...
- message(CRITICAL, "We've got a problem header!")
-
- def _get_meta(self):
- """ Return meta data suported by this ext as a tuple """
- level = int(self.config['level'][0]) - 1
- force = self._str2bool(self.config['forceid'][0])
- if hasattr(self.md, 'Meta'):
- if self.md.Meta.has_key('header_level'):
- level = int(self.md.Meta['header_level'][0]) - 1
- if self.md.Meta.has_key('header_forceid'):
- force = self._str2bool(self.md.Meta['header_forceid'][0])
- return level, force
-
- def _str2bool(self, s, default=False):
- """ Convert a string to a booleen value. """
- s = str(s)
- if s.lower() in ['0', 'f', 'false', 'off', 'no', 'n']:
- return False
- elif s.lower() in ['1', 't', 'true', 'on', 'yes', 'y']:
- return True
- return default
-
- def _unique_id(self, id):
- """ Ensure ID is unique. Append '_1', '_2'... if not """
- while id in self.IDs:
- m = IDCOUNT_RE.match(id)
- if m:
- id = '%s_%d'% (m.group(1), int(m.group(2))+1)
- else:
- id = '%s_%d'% (id, 1)
- self.IDs.append(id)
- return id
-
- def _create_id(self, header):
- """ Return ID from Header text. """
- h = ''
- for c in header.lower().replace(' ', '_'):
- if c in ID_CHARS:
- h += c
- elif c not in punctuation:
- h += '+'
- return self._unique_id(h)
-
-
-class HeaderIdExtension (markdown.Extension):
- def __init__(self, configs):
- # set defaults
- self.config = {
- 'level' : ['1', 'Base level for headers.'],
- 'forceid' : ['True', 'Force all headers to have an id.']
- }
-
- for key, value in configs:
- self.setConfig(key, value)
-
- def extendMarkdown(self, md, md_globals):
- md.registerExtension(self)
- self.processor = HeaderIdProcessor(md.parser)
- self.processor.md = md
- self.processor.config = self.config
- # Replace existing hasheader in place.
- md.parser.blockprocessors['hashheader'] = self.processor
-
- def reset(self):
- self.processor.IDs = []
-
-
-def makeExtension(configs=None):
- return HeaderIdExtension(configs=configs)
-
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
-
diff --git a/markdown/extensions/html_tidy.py b/markdown/extensions/html_tidy.py
deleted file mode 100644
index 5105e33..0000000
--- a/markdown/extensions/html_tidy.py
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/usr/bin/env python
-
-"""
-HTML Tidy Extension for Python-Markdown
-=======================================
-
-Runs [HTML Tidy][] on the output of Python-Markdown using the [uTidylib][]
-Python wrapper. Both libtidy and uTidylib must be installed on your system.
-
-Note than any Tidy [options][] can be passed in as extension configs. So,
-for example, to output HTML rather than XHTML, set ``output_xhtml=0``. To
-indent the output, set ``indent=auto`` and to have Tidy wrap the output in
-``<html>`` and ``<body>`` tags, set ``show_body_only=0``.
-
-[HTML Tidy]: http://tidy.sourceforge.net/
-[uTidylib]: http://utidylib.berlios.de/
-[options]: http://tidy.sourceforge.net/docs/quickref.html
-
-Copyright (c)2008 [Waylan Limberg](http://achinghead.com)
-
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
-
-Dependencies:
-* [Python2.3+](http://python.org)
-* [Markdown 2.0+](http://www.freewisdom.org/projects/python-markdown/)
-* [HTML Tidy](http://utidylib.berlios.de/)
-* [uTidylib](http://utidylib.berlios.de/)
-
-"""
-
-import markdown
-import tidy
-
-class TidyExtension(markdown.Extension):
-
- def __init__(self, configs):
- # Set defaults to match typical markdown behavior.
- self.config = dict(output_xhtml=1,
- show_body_only=1,
- )
- # Merge in user defined configs overriding any present if nessecary.
- for c in configs:
- self.config[c[0]] = c[1]
-
- def extendMarkdown(self, md, md_globals):
- # Save options to markdown instance
- md.tidy_options = self.config
- # Add TidyProcessor to postprocessors
- md.postprocessors['tidy'] = TidyProcessor(md)
-
-
-class TidyProcessor(markdown.postprocessors.Postprocessor):
-
- def run(self, text):
- # Pass text to Tidy. As Tidy does not accept unicode we need to encode
- # it and decode its return value.
- return unicode(tidy.parseString(text.encode('utf-8'),
- **self.markdown.tidy_options))
-
-
-def makeExtension(configs=None):
- return TidyExtension(configs=configs)
diff --git a/markdown/extensions/imagelinks.py b/markdown/extensions/imagelinks.py
deleted file mode 100644
index ee0b708..0000000
--- a/markdown/extensions/imagelinks.py
+++ /dev/null
@@ -1,119 +0,0 @@
-"""
-========================= IMAGE LINKS =================================
-
-
-Turns paragraphs like
-
-<~~~~~~~~~~~~~~~~~~~~~~~~
-dir/subdir
-dir/subdir
-dir/subdir
-~~~~~~~~~~~~~~
-dir/subdir
-dir/subdir
-dir/subdir
-~~~~~~~~~~~~~~~~~~~>
-
-Into mini-photo galleries.
-
-"""
-
-import re, markdown
-import url_manager
-
-
-IMAGE_LINK = """<a href="%s"><img src="%s" title="%s"/></a>"""
-SLIDESHOW_LINK = """<a href="%s" target="_blank">[slideshow]</a>"""
-ALBUM_LINK = """&nbsp;<a href="%s">[%s]</a>"""
-
-
-class ImageLinksExtension(markdown.Extension):
-
- def extendMarkdown(self, md, md_globals):
-
- md.preprocessors.add("imagelink", ImageLinkPreprocessor(md), "_begin")
-
-
-class ImageLinkPreprocessor(markdown.preprocessors.Preprocessor):
-
- def run(self, lines):
-
- url = url_manager.BlogEntryUrl(url_manager.BlogUrl("all"),
- "2006/08/29/the_rest_of_our")
-
-
- all_images = []
- blocks = []
- in_image_block = False
-
- new_lines = []
-
- for line in lines:
-
- if line.startswith("<~~~~~~~"):
- albums = []
- rows = []
- in_image_block = True
-
- if not in_image_block:
-
- new_lines.append(line)
-
- else:
-
- line = line.strip()
-
- if line.endswith("~~~~~~>") or not line:
- in_image_block = False
- new_block = "<div><br/><center><span class='image-links'>\n"
-
- album_url_hash = {}
-
- for row in rows:
- for photo_url, title in row:
- new_block += "&nbsp;"
- new_block += IMAGE_LINK % (photo_url,
- photo_url.get_thumbnail(),
- title)
-
- album_url_hash[str(photo_url.get_album())] = 1
-
- new_block += "<br/>"
-
- new_block += "</span>"
- new_block += SLIDESHOW_LINK % url.get_slideshow()
-
- album_urls = album_url_hash.keys()
- album_urls.sort()
-
- if len(album_urls) == 1:
- new_block += ALBUM_LINK % (album_urls[0], "complete album")
- else :
- for i in range(len(album_urls)) :
- new_block += ALBUM_LINK % (album_urls[i],
- "album %d" % (i + 1) )
-
- new_lines.append(new_block + "</center><br/></div>")
-
- elif line[1:6] == "~~~~~" :
- rows.append([]) # start a new row
- else :
- parts = line.split()
- line = parts[0]
- title = " ".join(parts[1:])
-
- album, photo = line.split("/")
- photo_url = url.get_photo(album, photo,
- len(all_images)+1)
- all_images.append(photo_url)
- rows[-1].append((photo_url, title))
-
- if not album in albums :
- albums.append(album)
-
- return new_lines
-
-
-def makeExtension(configs):
- return ImageLinksExtension(configs)
-
diff --git a/markdown/extensions/legacy_attrs.py b/markdown/extensions/legacy_attrs.py
new file mode 100644
index 0000000..445aba1
--- /dev/null
+++ b/markdown/extensions/legacy_attrs.py
@@ -0,0 +1,67 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+
+Legacy Attributes Extension
+===========================
+
+An extension to Python Markdown which implements legacy attributes.
+
+Prior to Python-Markdown version 3.0, the Markdown class had an `enable_attributes`
+keyword which was on by default and provided for attributes to be defined for elements
+using the format `{@key=value}`. This extension is provided as a replacement for
+backward compatibility. New documents should be authored using attr_lists. However,
+numerious documents exist which have been using the old attribute format for many
+years. This extension can be used to continue to render those documents correctly.
+"""
+
+import re
+from markdown.treeprocessors import Treeprocessor, isString
+from markdown.extensions import Extension
+
+
+ATTR_RE = re.compile(r'\{@([^\}]*)=([^\}]*)}') # {@id=123}
+
+
+class LegacyAttrs(Treeprocessor):
+ def run(self, doc):
+ """Find and set values of attributes ({@key=value}). """
+ for el in doc.iter():
+ alt = el.get('alt', None)
+ if alt is not None:
+ el.set('alt', self.handleAttributes(el, alt))
+ if el.text and isString(el.text):
+ el.text = self.handleAttributes(el, el.text)
+ if el.tail and isString(el.tail):
+ el.tail = self.handleAttributes(el, el.tail)
+
+ def handleAttributes(self, el, txt):
+ """ Set attributes and return text without definitions. """
+ def attributeCallback(match):
+ el.set(match.group(1), match.group(2).replace('\n', ' '))
+ return ATTR_RE.sub(attributeCallback, txt)
+
+
+class LegacyAttrExtension(Extension):
+ def extendMarkdown(self, md):
+ md.treeprocessors.register(LegacyAttrs(md), 'legacyattrs', 15)
+
+
+def makeExtension(**kwargs): # pragma: no cover
+ return LegacyAttrExtension(**kwargs)
diff --git a/markdown/extensions/legacy_em.py b/markdown/extensions/legacy_em.py
new file mode 100644
index 0000000..360988b
--- /dev/null
+++ b/markdown/extensions/legacy_em.py
@@ -0,0 +1,49 @@
+'''
+Legacy Em Extension for Python-Markdown
+=======================================
+
+This extension provides legacy behavior for _connected_words_.
+
+Copyright 2015-2018 The Python Markdown Project
+
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
+
+'''
+
+from . import Extension
+from ..inlinepatterns import UnderscoreProcessor, EmStrongItem, EM_STRONG2_RE, STRONG_EM2_RE
+import re
+
+# _emphasis_
+EMPHASIS_RE = r'(_)([^_]+)\1'
+
+# __strong__
+STRONG_RE = r'(_{2})(.+?)\1'
+
+# __strong_em___
+STRONG_EM_RE = r'(_)\1(?!\1)([^_]+?)\1(?!\1)(.+?)\1{3}'
+
+
+class LegacyUnderscoreProcessor(UnderscoreProcessor):
+ """Emphasis processor for handling strong and em matches inside underscores."""
+
+ PATTERNS = [
+ EmStrongItem(re.compile(EM_STRONG2_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
+ EmStrongItem(re.compile(STRONG_EM2_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
+ EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
+ EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
+ EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
+ ]
+
+
+class LegacyEmExtension(Extension):
+ """ Add legacy_em extension to Markdown class."""
+
+ def extendMarkdown(self, md):
+ """ Modify inline patterns. """
+ md.inlinePatterns.register(LegacyUnderscoreProcessor(r'_'), 'em_strong2', 50)
+
+
+def makeExtension(**kwargs): # pragma: no cover
+ """ Return an instance of the LegacyEmExtension """
+ return LegacyEmExtension(**kwargs)
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
new file mode 100644
index 0000000..ec7dcba
--- /dev/null
+++ b/markdown/extensions/md_in_html.py
@@ -0,0 +1,364 @@
+"""
+Python-Markdown Markdown in HTML Extension
+===============================
+
+An implementation of [PHP Markdown Extra](http://michelf.com/projects/php-markdown/extra/)'s
+parsing of Markdown syntax in raw HTML.
+
+See <https://Python-Markdown.github.io/extensions/raw_html>
+for documentation.
+
+Copyright The Python Markdown Project
+
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
+
+"""
+
+from . import Extension
+from ..blockprocessors import BlockProcessor
+from ..preprocessors import Preprocessor
+from ..postprocessors import RawHtmlPostprocessor
+from .. import util
+from ..htmlparser import HTMLExtractor, blank_line_re
+import xml.etree.ElementTree as etree
+
+
+class HTMLExtractorExtra(HTMLExtractor):
+ """
+ Override HTMLExtractor and create etree Elements for any elements which should have content parsed as Markdown.
+ """
+
+ def __init__(self, md, *args, **kwargs):
+ # All block-level tags.
+ self.block_level_tags = set(md.block_level_elements.copy())
+ # Block-level tags in which the content only gets span level parsing
+ self.span_tags = set(
+ ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'summary', 'td', 'th']
+ )
+ # Block-level tags which never get their content parsed.
+ self.raw_tags = set(['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea'])
+
+ super().__init__(md, *args, **kwargs)
+
+ # Block-level tags in which the content gets parsed as blocks
+ self.block_tags = set(self.block_level_tags) - (self.span_tags | self.raw_tags | self.empty_tags)
+ self.span_and_blocks_tags = self.block_tags | self.span_tags
+
+ def reset(self):
+ """Reset this instance. Loses all unprocessed data."""
+ self.mdstack = [] # When markdown=1, stack contains a list of tags
+ self.treebuilder = etree.TreeBuilder()
+ self.mdstate = [] # one of 'block', 'span', 'off', or None
+ super().reset()
+
+ def close(self):
+ """Handle any buffered data."""
+ super().close()
+ # Handle any unclosed tags.
+ if self.mdstack:
+ # Close the outermost parent. handle_endtag will close all unclosed children.
+ self.handle_endtag(self.mdstack[0])
+
+ def get_element(self):
+ """ Return element from treebuilder and reset treebuilder for later use. """
+ element = self.treebuilder.close()
+ self.treebuilder = etree.TreeBuilder()
+ return element
+
+ def get_state(self, tag, attrs):
+ """ Return state from tag and `markdown` attr. One of 'block', 'span', or 'off'. """
+ md_attr = attrs.get('markdown', '0')
+ if md_attr == 'markdown':
+ # `<tag markdown>` is the same as `<tag markdown='1'>`.
+ md_attr = '1'
+ parent_state = self.mdstate[-1] if self.mdstate else None
+ if parent_state == 'off' or (parent_state == 'span' and md_attr != '0'):
+ # Only use the parent state if it is more restrictive than the markdown attribute.
+ md_attr = parent_state
+ if ((md_attr == '1' and tag in self.block_tags) or
+ (md_attr == 'block' and tag in self.span_and_blocks_tags)):
+ return 'block'
+ elif ((md_attr == '1' and tag in self.span_tags) or
+ (md_attr == 'span' and tag in self.span_and_blocks_tags)):
+ return 'span'
+ elif tag in self.block_level_tags:
+ return 'off'
+ else: # pragma: no cover
+ return None
+
+ def handle_starttag(self, tag, attrs):
+ # Handle tags that should always be empty and do not specify a closing tag
+ if tag in self.empty_tags and (self.at_line_start() or self.intail):
+ attrs = {key: value if value is not None else key for key, value in attrs}
+ if "markdown" in attrs:
+ attrs.pop('markdown')
+ element = etree.Element(tag, attrs)
+ data = etree.tostring(element, encoding='unicode', method='html')
+ else:
+ data = self.get_starttag_text()
+ self.handle_empty_tag(data, True)
+ return
+
+ if tag in self.block_level_tags and (self.at_line_start() or self.intail):
+ # Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`.
+ # Convert to `{'checked': 'checked'}`.
+ attrs = {key: value if value is not None else key for key, value in attrs}
+ state = self.get_state(tag, attrs)
+ if self.inraw or (state in [None, 'off'] and not self.mdstack):
+ # fall back to default behavior
+ attrs.pop('markdown', None)
+ super().handle_starttag(tag, attrs)
+ else:
+ if 'p' in self.mdstack and tag in self.block_level_tags:
+ # Close unclosed 'p' tag
+ self.handle_endtag('p')
+ self.mdstate.append(state)
+ self.mdstack.append(tag)
+ attrs['markdown'] = state
+ self.treebuilder.start(tag, attrs)
+ else:
+ # Span level tag
+ if self.inraw:
+ super().handle_starttag(tag, attrs)
+ else:
+ text = self.get_starttag_text()
+ if self.mdstate and self.mdstate[-1] == "off":
+ self.handle_data(self.md.htmlStash.store(text))
+ else:
+ self.handle_data(text)
+ if tag in self.CDATA_CONTENT_ELEMENTS:
+ # This is presumably a standalone tag in a code span (see #1036).
+ self.clear_cdata_mode()
+
+ def handle_endtag(self, tag):
+ if tag in self.block_level_tags:
+ if self.inraw:
+ super().handle_endtag(tag)
+ elif tag in self.mdstack:
+ # Close element and any unclosed children
+ while self.mdstack:
+ item = self.mdstack.pop()
+ self.mdstate.pop()
+ self.treebuilder.end(item)
+ if item == tag:
+ break
+ if not self.mdstack:
+ # Last item in stack is closed. Stash it
+ element = self.get_element()
+ # Get last entry to see if it ends in newlines
+ # If it is an element, assume there is no newlines
+ item = self.cleandoc[-1] if self.cleandoc else ''
+ # If we only have one newline before block element, add another
+ if not item.endswith('\n\n') and item.endswith('\n'):
+ self.cleandoc.append('\n')
+ self.cleandoc.append(self.md.htmlStash.store(element))
+ self.cleandoc.append('\n\n')
+ self.state = []
+ # Check if element has a tail
+ if not blank_line_re.match(
+ self.rawdata[self.line_offset + self.offset + len(self.get_endtag_text(tag)):]):
+ # More content exists after endtag.
+ self.intail = True
+ else:
+ # Treat orphan closing tag as a span level tag.
+ text = self.get_endtag_text(tag)
+ if self.mdstate and self.mdstate[-1] == "off":
+ self.handle_data(self.md.htmlStash.store(text))
+ else:
+ self.handle_data(text)
+ else:
+ # Span level tag
+ if self.inraw:
+ super().handle_endtag(tag)
+ else:
+ text = self.get_endtag_text(tag)
+ if self.mdstate and self.mdstate[-1] == "off":
+ self.handle_data(self.md.htmlStash.store(text))
+ else:
+ self.handle_data(text)
+
+ def handle_startendtag(self, tag, attrs):
+ if tag in self.empty_tags:
+ attrs = {key: value if value is not None else key for key, value in attrs}
+ if "markdown" in attrs:
+ attrs.pop('markdown')
+ element = etree.Element(tag, attrs)
+ data = etree.tostring(element, encoding='unicode', method='html')
+ else:
+ data = self.get_starttag_text()
+ else:
+ data = self.get_starttag_text()
+ self.handle_empty_tag(data, is_block=self.md.is_block_level(tag))
+
+ def handle_data(self, data):
+ if self.intail and '\n' in data:
+ self.intail = False
+ if self.inraw or not self.mdstack:
+ super().handle_data(data)
+ else:
+ self.treebuilder.data(data)
+
+ def handle_empty_tag(self, data, is_block):
+ if self.inraw or not self.mdstack:
+ super().handle_empty_tag(data, is_block)
+ else:
+ if self.at_line_start() and is_block:
+ self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n')
+ else:
+ self.handle_data(self.md.htmlStash.store(data))
+
+ def parse_pi(self, i):
+ if self.at_line_start() or self.intail or self.mdstack:
+ # The same override exists in HTMLExtractor without the check
+ # for mdstack. Therefore, use HTMLExtractor's parent instead.
+ return super(HTMLExtractor, self).parse_pi(i)
+ # This is not the beginning of a raw block so treat as plain data
+ # and avoid consuming any tags which may follow (see #1066).
+ self.handle_data('<?')
+ return i + 2
+
+ def parse_html_declaration(self, i):
+ if self.at_line_start() or self.intail or self.mdstack:
+ # The same override exists in HTMLExtractor without the check
+ # for mdstack. Therefore, use HTMLExtractor's parent instead.
+ return super(HTMLExtractor, self).parse_html_declaration(i)
+ # This is not the beginning of a raw block so treat as plain data
+ # and avoid consuming any tags which may follow (see #1066).
+ self.handle_data('<!')
+ return i + 2
+
+
+class HtmlBlockPreprocessor(Preprocessor):
+ """Remove html blocks from the text and store them for later retrieval."""
+
+ def run(self, lines):
+ source = '\n'.join(lines)
+ parser = HTMLExtractorExtra(self.md)
+ parser.feed(source)
+ parser.close()
+ return ''.join(parser.cleandoc).split('\n')
+
+
+class MarkdownInHtmlProcessor(BlockProcessor):
+ """Process Markdown Inside HTML Blocks which have been stored in the HtmlStash."""
+
+ def test(self, parent, block):
+ # ALways return True. `run` will return `False` it not a valid match.
+ return True
+
+ def parse_element_content(self, element):
+ """
+ Recursively parse the text content of an etree Element as Markdown.
+
+ Any block level elements generated from the Markdown will be inserted as children of the element in place
+ of the text content. All `markdown` attributes are removed. For any elements in which Markdown parsing has
+ been disabled, the text content of it and its chidlren are wrapped in an `AtomicString`.
+ """
+
+ md_attr = element.attrib.pop('markdown', 'off')
+
+ if md_attr == 'block':
+ # Parse content as block level
+ # The order in which the different parts are parsed (text, children, tails) is important here as the
+ # order of elements needs to be preserved. We can't be inserting items at a later point in the current
+ # iteration as we don't want to do raw processing on elements created from parsing Markdown text (for
+ # example). Therefore, the order of operations is children, tails, text.
+
+ # Recursively parse existing children from raw HTML
+ for child in list(element):
+ self.parse_element_content(child)
+
+ # Parse Markdown text in tail of children. Do this separate to avoid raw HTML parsing.
+ # Save the position of each item to be inserted later in reverse.
+ tails = []
+ for pos, child in enumerate(element):
+ if child.tail:
+ block = child.tail.rstrip('\n')
+ child.tail = ''
+ # Use a dummy placeholder element.
+ dummy = etree.Element('div')
+ self.parser.parseBlocks(dummy, block.split('\n\n'))
+ children = list(dummy)
+ children.reverse()
+ tails.append((pos + 1, children))
+
+ # Insert the elements created from the tails in reverse.
+ tails.reverse()
+ for pos, tail in tails:
+ for item in tail:
+ element.insert(pos, item)
+
+ # Parse Markdown text content. Do this last to avoid raw HTML parsing.
+ if element.text:
+ block = element.text.rstrip('\n')
+ element.text = ''
+ # Use a dummy placeholder element as the content needs to get inserted before existing children.
+ dummy = etree.Element('div')
+ self.parser.parseBlocks(dummy, block.split('\n\n'))
+ children = list(dummy)
+ children.reverse()
+ for child in children:
+ element.insert(0, child)
+
+ elif md_attr == 'span':
+ # Span level parsing will be handled by inlineprocessors.
+ # Walk children here to remove any `markdown` attributes.
+ for child in list(element):
+ self.parse_element_content(child)
+
+ else:
+ # Disable inline parsing for everything else
+ if element.text is None:
+ element.text = ''
+ element.text = util.AtomicString(element.text)
+ for child in list(element):
+ self.parse_element_content(child)
+ if child.tail:
+ child.tail = util.AtomicString(child.tail)
+
+ def run(self, parent, blocks):
+ m = util.HTML_PLACEHOLDER_RE.match(blocks[0])
+ if m:
+ index = int(m.group(1))
+ element = self.parser.md.htmlStash.rawHtmlBlocks[index]
+ if isinstance(element, etree.Element):
+ # We have a matched element. Process it.
+ blocks.pop(0)
+ self.parse_element_content(element)
+ parent.append(element)
+ # Cleanup stash. Replace element with empty string to avoid confusing postprocessor.
+ self.parser.md.htmlStash.rawHtmlBlocks.pop(index)
+ self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '')
+ # Confirm the match to the blockparser.
+ return True
+ # No match found.
+ return False
+
+
+class MarkdownInHTMLPostprocessor(RawHtmlPostprocessor):
+ def stash_to_string(self, text):
+ """ Override default to handle any etree elements still in the stash. """
+ if isinstance(text, etree.Element):
+ return self.md.serializer(text)
+ else:
+ return str(text)
+
+
+class MarkdownInHtmlExtension(Extension):
+ """Add Markdown parsing in HTML to Markdown class."""
+
+ def extendMarkdown(self, md):
+ """ Register extension instances. """
+
+ # Replace raw HTML preprocessor
+ md.preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
+ # Add blockprocessor which handles the placeholders for etree elements
+ md.parser.blockprocessors.register(
+ MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105
+ )
+ # Replace raw HTML postprocessor
+ md.postprocessors.register(MarkdownInHTMLPostprocessor(md), 'raw_html', 30)
+
+
+def makeExtension(**kwargs): # pragma: no cover
+ return MarkdownInHtmlExtension(**kwargs)
diff --git a/markdown/extensions/meta.py b/markdown/extensions/meta.py
index 1b555b2..10dee11 100644
--- a/markdown/extensions/meta.py
+++ b/markdown/extensions/meta.py
@@ -1,75 +1,68 @@
-#!usr/bin/python
-
"""
Meta Data Extension for Python-Markdown
=======================================
This extension adds Meta Data handling to markdown.
-Basic Usage:
-
- >>> import markdown
- >>> text = '''Title: A Test Doc.
- ... Author: Waylan Limberg
- ... John Doe
- ... Blank_Data:
- ...
- ... The body. This is paragraph one.
- ... '''
- >>> md = markdown.Markdown(['meta'])
- >>> md.convert(text)
- u'<p>The body. This is paragraph one.</p>'
- >>> md.Meta
- {u'blank_data': [u''], u'author': [u'Waylan Limberg', u'John Doe'], u'title': [u'A Test Doc.']}
-
-Make sure text without Meta Data still works (markdown < 1.6b returns a <p>).
+See <https://Python-Markdown.github.io/extensions/meta_data>
+for documentation.
- >>> text = ' Some Code - not extra lines of meta data.'
- >>> md = markdown.Markdown(['meta'])
- >>> md.convert(text)
- u'<pre><code>Some Code - not extra lines of meta data.\\n</code></pre>'
- >>> md.Meta
- {}
+Original code Copyright 2007-2008 [Waylan Limberg](http://achinghead.com).
-Copyright 2007-2008 [Waylan Limberg](http://achinghead.com).
+All changes Copyright 2008-2014 The Python Markdown Project
-Project website: <http://www.freewisdom.org/project/python-markdown/Meta-Data>
-Contact: markdown@freewisdom.org
-
-License: BSD (see ../docs/LICENSE for details)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-import markdown, re
+from . import Extension
+from ..preprocessors import Preprocessor
+import re
+import logging
+
+log = logging.getLogger('MARKDOWN')
# Global Vars
META_RE = re.compile(r'^[ ]{0,3}(?P<key>[A-Za-z0-9_-]+):\s*(?P<value>.*)')
META_MORE_RE = re.compile(r'^[ ]{4,}(?P<value>.*)')
+BEGIN_RE = re.compile(r'^-{3}(\s.*)?')
+END_RE = re.compile(r'^(-{3}|\.{3})(\s.*)?')
+
-class MetaExtension (markdown.Extension):
+class MetaExtension (Extension):
""" Meta-Data extension for Python-Markdown. """
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Add MetaPreprocessor to Markdown instance. """
+ md.registerExtension(self)
+ self.md = md
+ md.preprocessors.register(MetaPreprocessor(md), 'meta', 27)
- md.preprocessors.add("meta", MetaPreprocessor(md), "_begin")
+ def reset(self):
+ self.md.Meta = {}
-class MetaPreprocessor(markdown.preprocessors.Preprocessor):
+class MetaPreprocessor(Preprocessor):
""" Get Meta-Data. """
def run(self, lines):
""" Parse Meta-Data and store in Markdown.Meta. """
meta = {}
key = None
- while 1:
+ if lines and BEGIN_RE.match(lines[0]):
+ lines.pop(0)
+ while lines:
line = lines.pop(0)
- if line.strip() == '':
- break # blank line - done
m1 = META_RE.match(line)
+ if line.strip() == '' or END_RE.match(line):
+ break # blank line or end of YAML header - done
if m1:
key = m1.group('key').lower().strip()
- meta[key] = [m1.group('value').strip()]
+ value = m1.group('value').strip()
+ try:
+ meta[key].append(value)
+ except KeyError:
+ meta[key] = [value]
else:
m2 = META_MORE_RE.match(line)
if m2 and key:
@@ -77,14 +70,10 @@ class MetaPreprocessor(markdown.preprocessors.Preprocessor):
meta[key].append(m2.group('value').strip())
else:
lines.insert(0, line)
- break # no meta data - done
- self.markdown.Meta = meta
+ break # no meta data - done
+ self.md.Meta = meta
return lines
-
-def makeExtension(configs={}):
- return MetaExtension(configs=configs)
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
+def makeExtension(**kwargs): # pragma: no cover
+ return MetaExtension(**kwargs)
diff --git a/markdown/extensions/nl2br.py b/markdown/extensions/nl2br.py
new file mode 100644
index 0000000..6c7491b
--- /dev/null
+++ b/markdown/extensions/nl2br.py
@@ -0,0 +1,33 @@
+"""
+NL2BR Extension
+===============
+
+A Python-Markdown extension to treat newlines as hard breaks; like
+GitHub-flavored Markdown does.
+
+See <https://Python-Markdown.github.io/extensions/nl2br>
+for documentation.
+
+Oringinal code Copyright 2011 [Brian Neal](https://deathofagremmie.com/)
+
+All changes Copyright 2011-2014 The Python Markdown Project
+
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
+
+"""
+
+from . import Extension
+from ..inlinepatterns import SubstituteTagInlineProcessor
+
+BR_RE = r'\n'
+
+
+class Nl2BrExtension(Extension):
+
+ def extendMarkdown(self, md):
+ br_tag = SubstituteTagInlineProcessor(BR_RE, 'br')
+ md.inlinePatterns.register(br_tag, 'nl', 5)
+
+
+def makeExtension(**kwargs): # pragma: no cover
+ return Nl2BrExtension(**kwargs)
diff --git a/markdown/extensions/rss.py b/markdown/extensions/rss.py
deleted file mode 100644
index 1274da2..0000000
--- a/markdown/extensions/rss.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import markdown
-from markdown import etree
-
-DEFAULT_URL = "http://www.freewisdom.org/projects/python-markdown/"
-DEFAULT_CREATOR = "Yuri Takhteyev"
-DEFAULT_TITLE = "Markdown in Python"
-GENERATOR = "http://www.freewisdom.org/projects/python-markdown/markdown2rss"
-
-month_map = { "Jan" : "01",
- "Feb" : "02",
- "March" : "03",
- "April" : "04",
- "May" : "05",
- "June" : "06",
- "July" : "07",
- "August" : "08",
- "September" : "09",
- "October" : "10",
- "November" : "11",
- "December" : "12" }
-
-def get_time(heading):
-
- heading = heading.split("-")[0]
- heading = heading.strip().replace(",", " ").replace(".", " ")
-
- month, date, year = heading.split()
- month = month_map[month]
-
- return rdftime(" ".join((month, date, year, "12:00:00 AM")))
-
-def rdftime(time):
-
- time = time.replace(":", " ")
- time = time.replace("/", " ")
- time = time.split()
- return "%s-%s-%sT%s:%s:%s-08:00" % (time[0], time[1], time[2],
- time[3], time[4], time[5])
-
-
-def get_date(text):
- return "date"
-
-class RssExtension (markdown.Extension):
-
- def extendMarkdown(self, md, md_globals):
-
- self.config = { 'URL' : [DEFAULT_URL, "Main URL"],
- 'CREATOR' : [DEFAULT_CREATOR, "Feed creator's name"],
- 'TITLE' : [DEFAULT_TITLE, "Feed title"] }
-
- md.xml_mode = True
-
- # Insert a tree-processor that would actually add the title tag
- treeprocessor = RssTreeProcessor(md)
- treeprocessor.ext = self
- md.treeprocessors['rss'] = treeprocessor
- md.stripTopLevelTags = 0
- md.docType = '<?xml version="1.0" encoding="utf-8"?>\n'
-
-class RssTreeProcessor(markdown.treeprocessors.Treeprocessor):
-
- def run (self, root):
-
- rss = etree.Element("rss")
- rss.set("version", "2.0")
-
- channel = etree.SubElement(rss, "channel")
-
- for tag, text in (("title", self.ext.getConfig("TITLE")),
- ("link", self.ext.getConfig("URL")),
- ("description", None)):
-
- element = etree.SubElement(channel, tag)
- element.text = text
-
- for child in root:
-
- if child.tag in ["h1", "h2", "h3", "h4", "h5"]:
-
- heading = child.text.strip()
- item = etree.SubElement(channel, "item")
- link = etree.SubElement(item, "link")
- link.text = self.ext.getConfig("URL")
- title = etree.SubElement(item, "title")
- title.text = heading
-
- guid = ''.join([x for x in heading if x.isalnum()])
- guidElem = etree.SubElement(item, "guid")
- guidElem.text = guid
- guidElem.set("isPermaLink", "false")
-
- elif child.tag in ["p"]:
- try:
- description = etree.SubElement(item, "description")
- except UnboundLocalError:
- # Item not defined - moving on
- pass
- else:
- if len(child):
- content = "\n".join([etree.tostring(node)
- for node in child])
- else:
- content = child.text
- pholder = self.markdown.htmlStash.store(
- "<![CDATA[ %s]]>" % content)
- description.text = pholder
-
- return rss
-
-
-def makeExtension(configs):
-
- return RssExtension(configs)
diff --git a/markdown/extensions/sane_lists.py b/markdown/extensions/sane_lists.py
new file mode 100644
index 0000000..e27eb18
--- /dev/null
+++ b/markdown/extensions/sane_lists.py
@@ -0,0 +1,54 @@
+"""
+Sane List Extension for Python-Markdown
+=======================================
+
+Modify the behavior of Lists in Python-Markdown to act in a sane manor.
+
+See <https://Python-Markdown.github.io/extensions/sane_lists>
+for documentation.
+
+Original code Copyright 2011 [Waylan Limberg](http://achinghead.com)
+
+All changes Copyright 2011-2014 The Python Markdown Project
+
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
+
+"""
+
+from . import Extension
+from ..blockprocessors import OListProcessor, UListProcessor
+import re
+
+
+class SaneOListProcessor(OListProcessor):
+
+ SIBLING_TAGS = ['ol']
+ LAZY_OL = False
+
+ def __init__(self, parser):
+ super().__init__(parser)
+ self.CHILD_RE = re.compile(r'^[ ]{0,%d}((\d+\.))[ ]+(.*)' %
+ (self.tab_length - 1))
+
+
+class SaneUListProcessor(UListProcessor):
+
+ SIBLING_TAGS = ['ul']
+
+ def __init__(self, parser):
+ super().__init__(parser)
+ self.CHILD_RE = re.compile(r'^[ ]{0,%d}(([*+-]))[ ]+(.*)' %
+ (self.tab_length - 1))
+
+
+class SaneListExtension(Extension):
+ """ Add sane lists to Markdown. """
+
+ def extendMarkdown(self, md):
+ """ Override existing Processors. """
+ md.parser.blockprocessors.register(SaneOListProcessor(md.parser), 'olist', 40)
+ md.parser.blockprocessors.register(SaneUListProcessor(md.parser), 'ulist', 30)
+
+
+def makeExtension(**kwargs): # pragma: no cover
+ return SaneListExtension(**kwargs)
diff --git a/markdown/extensions/smarty.py b/markdown/extensions/smarty.py
new file mode 100644
index 0000000..c4bfd58
--- /dev/null
+++ b/markdown/extensions/smarty.py
@@ -0,0 +1,257 @@
+'''
+Smarty extension for Python-Markdown
+====================================
+
+Adds conversion of ASCII dashes, quotes and ellipses to their HTML
+entity equivalents.
+
+See <https://Python-Markdown.github.io/extensions/smarty>
+for documentation.
+
+Author: 2013, Dmitry Shachnev <mitya57@gmail.com>
+
+All changes Copyright 2013-2014 The Python Markdown Project
+
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
+
+SmartyPants license:
+
+ Copyright (c) 2003 John Gruber <https://daringfireball.net/>
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ * Neither the name "SmartyPants" nor the names of its contributors
+ may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+ This software is provided by the copyright holders and contributors "as
+ is" and any express or implied warranties, including, but not limited
+ to, the implied warranties of merchantability and fitness for a
+ particular purpose are disclaimed. In no event shall the copyright
+ owner or contributors be liable for any direct, indirect, incidental,
+ special, exemplary, or consequential damages (including, but not
+ limited to, procurement of substitute goods or services; loss of use,
+ data, or profits; or business interruption) however caused and on any
+ theory of liability, whether in contract, strict liability, or tort
+ (including negligence or otherwise) arising in any way out of the use
+ of this software, even if advised of the possibility of such damage.
+
+
+smartypants.py license:
+
+ smartypants.py is a derivative work of SmartyPants.
+ Copyright (c) 2004, 2007 Chad Miller <http://web.chad.org/>
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ This software is provided by the copyright holders and contributors "as
+ is" and any express or implied warranties, including, but not limited
+ to, the implied warranties of merchantability and fitness for a
+ particular purpose are disclaimed. In no event shall the copyright
+ owner or contributors be liable for any direct, indirect, incidental,
+ special, exemplary, or consequential damages (including, but not
+ limited to, procurement of substitute goods or services; loss of use,
+ data, or profits; or business interruption) however caused and on any
+ theory of liability, whether in contract, strict liability, or tort
+ (including negligence or otherwise) arising in any way out of the use
+ of this software, even if advised of the possibility of such damage.
+
+'''
+
+
+from . import Extension
+from ..inlinepatterns import HtmlInlineProcessor, HTML_RE
+from ..treeprocessors import InlineProcessor
+from ..util import Registry
+
+
+# Constants for quote education.
+punctClass = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""
+endOfWordClass = r"[\s.,;:!?)]"
+closeClass = r"[^\ \t\r\n\[\{\(\-\u0002\u0003]"
+
+openingQuotesBase = (
+ r'(\s' # a whitespace char
+ r'|&nbsp;' # or a non-breaking space entity
+ r'|--' # or dashes
+ r'|–|—' # or unicode
+ r'|&[mn]dash;' # or named dash entities
+ r'|&#8211;|&#8212;' # or decimal entities
+ r')'
+)
+
+substitutions = {
+ 'mdash': '&mdash;',
+ 'ndash': '&ndash;',
+ 'ellipsis': '&hellip;',
+ 'left-angle-quote': '&laquo;',
+ 'right-angle-quote': '&raquo;',
+ 'left-single-quote': '&lsquo;',
+ 'right-single-quote': '&rsquo;',
+ 'left-double-quote': '&ldquo;',
+ 'right-double-quote': '&rdquo;',
+}
+
+
+# Special case if the very first character is a quote
+# followed by punctuation at a non-word-break. Close the quotes by brute force:
+singleQuoteStartRe = r"^'(?=%s\B)" % punctClass
+doubleQuoteStartRe = r'^"(?=%s\B)' % punctClass
+
+# Special case for double sets of quotes, e.g.:
+# <p>He said, "'Quoted' words in a larger quote."</p>
+doubleQuoteSetsRe = r""""'(?=\w)"""
+singleQuoteSetsRe = r"""'"(?=\w)"""
+
+# Special case for decade abbreviations (the '80s):
+decadeAbbrRe = r"(?<!\w)'(?=\d{2}s)"
+
+# Get most opening double quotes:
+openingDoubleQuotesRegex = r'%s"(?=\w)' % openingQuotesBase
+
+# Double closing quotes:
+closingDoubleQuotesRegex = r'"(?=\s)'
+closingDoubleQuotesRegex2 = '(?<=%s)"' % closeClass
+
+# Get most opening single quotes:
+openingSingleQuotesRegex = r"%s'(?=\w)" % openingQuotesBase
+
+# Single closing quotes:
+closingSingleQuotesRegex = r"(?<=%s)'(?!\s|s\b|\d)" % closeClass
+closingSingleQuotesRegex2 = r"(?<=%s)'(\s|s\b)" % closeClass
+
+# All remaining quotes should be opening ones
+remainingSingleQuotesRegex = r"'"
+remainingDoubleQuotesRegex = r'"'
+
+HTML_STRICT_RE = HTML_RE + r'(?!\>)'
+
+
+class SubstituteTextPattern(HtmlInlineProcessor):
+ def __init__(self, pattern, replace, md):
+ """ Replaces matches with some text. """
+ HtmlInlineProcessor.__init__(self, pattern)
+ self.replace = replace
+ self.md = md
+
+ def handleMatch(self, m, data):
+ result = ''
+ for part in self.replace:
+ if isinstance(part, int):
+ result += m.group(part)
+ else:
+ result += self.md.htmlStash.store(part)
+ return result, m.start(0), m.end(0)
+
+
+class SmartyExtension(Extension):
+ def __init__(self, **kwargs):
+ self.config = {
+ 'smart_quotes': [True, 'Educate quotes'],
+ 'smart_angled_quotes': [False, 'Educate angled quotes'],
+ 'smart_dashes': [True, 'Educate dashes'],
+ 'smart_ellipses': [True, 'Educate ellipses'],
+ 'substitutions': [{}, 'Overwrite default substitutions'],
+ }
+ super().__init__(**kwargs)
+ self.substitutions = dict(substitutions)
+ self.substitutions.update(self.getConfig('substitutions', default={}))
+
+ def _addPatterns(self, md, patterns, serie, priority):
+ for ind, pattern in enumerate(patterns):
+ pattern += (md,)
+ pattern = SubstituteTextPattern(*pattern)
+ name = 'smarty-%s-%d' % (serie, ind)
+ self.inlinePatterns.register(pattern, name, priority-ind)
+
+ def educateDashes(self, md):
+ emDashesPattern = SubstituteTextPattern(
+ r'(?<!-)---(?!-)', (self.substitutions['mdash'],), md
+ )
+ enDashesPattern = SubstituteTextPattern(
+ r'(?<!-)--(?!-)', (self.substitutions['ndash'],), md
+ )
+ self.inlinePatterns.register(emDashesPattern, 'smarty-em-dashes', 50)
+ self.inlinePatterns.register(enDashesPattern, 'smarty-en-dashes', 45)
+
+ def educateEllipses(self, md):
+ ellipsesPattern = SubstituteTextPattern(
+ r'(?<!\.)\.{3}(?!\.)', (self.substitutions['ellipsis'],), md
+ )
+ self.inlinePatterns.register(ellipsesPattern, 'smarty-ellipses', 10)
+
+ def educateAngledQuotes(self, md):
+ leftAngledQuotePattern = SubstituteTextPattern(
+ r'\<\<', (self.substitutions['left-angle-quote'],), md
+ )
+ rightAngledQuotePattern = SubstituteTextPattern(
+ r'\>\>', (self.substitutions['right-angle-quote'],), md
+ )
+ self.inlinePatterns.register(leftAngledQuotePattern, 'smarty-left-angle-quotes', 40)
+ self.inlinePatterns.register(rightAngledQuotePattern, 'smarty-right-angle-quotes', 35)
+
+ def educateQuotes(self, md):
+ lsquo = self.substitutions['left-single-quote']
+ rsquo = self.substitutions['right-single-quote']
+ ldquo = self.substitutions['left-double-quote']
+ rdquo = self.substitutions['right-double-quote']
+ patterns = (
+ (singleQuoteStartRe, (rsquo,)),
+ (doubleQuoteStartRe, (rdquo,)),
+ (doubleQuoteSetsRe, (ldquo + lsquo,)),
+ (singleQuoteSetsRe, (lsquo + ldquo,)),
+ (decadeAbbrRe, (rsquo,)),
+ (openingSingleQuotesRegex, (1, lsquo)),
+ (closingSingleQuotesRegex, (rsquo,)),
+ (closingSingleQuotesRegex2, (rsquo, 1)),
+ (remainingSingleQuotesRegex, (lsquo,)),
+ (openingDoubleQuotesRegex, (1, ldquo)),
+ (closingDoubleQuotesRegex, (rdquo,)),
+ (closingDoubleQuotesRegex2, (rdquo,)),
+ (remainingDoubleQuotesRegex, (ldquo,))
+ )
+ self._addPatterns(md, patterns, 'quotes', 30)
+
+ def extendMarkdown(self, md):
+ configs = self.getConfigs()
+ self.inlinePatterns = Registry()
+ if configs['smart_ellipses']:
+ self.educateEllipses(md)
+ if configs['smart_quotes']:
+ self.educateQuotes(md)
+ if configs['smart_angled_quotes']:
+ self.educateAngledQuotes(md)
+ # Override HTML_RE from inlinepatterns.py so that it does not
+ # process tags with duplicate closing quotes.
+ md.inlinePatterns.register(HtmlInlineProcessor(HTML_STRICT_RE, md), 'html', 90)
+ if configs['smart_dashes']:
+ self.educateDashes(md)
+ inlineProcessor = InlineProcessor(md)
+ inlineProcessor.inlinePatterns = self.inlinePatterns
+ md.treeprocessors.register(inlineProcessor, 'smarty', 2)
+ md.ESCAPED_CHARS.extend(['"', "'"])
+
+
+def makeExtension(**kwargs): # pragma: no cover
+ return SmartyExtension(**kwargs)
diff --git a/markdown/extensions/tables.py b/markdown/extensions/tables.py
index 1d3c920..c8b1024 100644
--- a/markdown/extensions/tables.py
+++ b/markdown/extensions/tables.py
@@ -1,44 +1,88 @@
-#!/usr/bin/env Python
"""
Tables Extension for Python-Markdown
====================================
Added parsing of tables to Python-Markdown.
-A simple example:
+See <https://Python-Markdown.github.io/extensions/tables>
+for documentation.
- First Header | Second Header
- ------------- | -------------
- Content Cell | Content Cell
- Content Cell | Content Cell
+Original code Copyright 2009 [Waylan Limberg](http://achinghead.com)
+
+All changes Copyright 2008-2014 The Python Markdown Project
+
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
-Copyright 2009 - [Waylan Limberg](http://achinghead.com)
"""
-import markdown
-from markdown import etree
+
+from . import Extension
+from ..blockprocessors import BlockProcessor
+import xml.etree.ElementTree as etree
+import re
+PIPE_NONE = 0
+PIPE_LEFT = 1
+PIPE_RIGHT = 2
-class TableProcessor(markdown.blockprocessors.BlockProcessor):
+class TableProcessor(BlockProcessor):
""" Process Tables. """
+ RE_CODE_PIPES = re.compile(r'(?:(\\\\)|(\\`+)|(`+)|(\\\|)|(\|))')
+ RE_END_BORDER = re.compile(r'(?<!\\)(?:\\\\)*\|$')
+
+ def __init__(self, parser, config):
+ self.border = False
+ self.separator = ''
+ self.config = config
+
+ super().__init__(parser)
+
def test(self, parent, block):
- rows = block.split('\n')
- return (len(rows) > 2 and '|' in rows[0] and
- '|' in rows[1] and '-' in rows[1] and
- rows[1][0] in ['|', ':', '-'])
+ """
+ Ensure first two rows (column header and separator row) are valid table rows.
+
+ Keep border check and separator row do avoid repeating the work.
+ """
+ is_table = False
+ rows = [row.strip(' ') for row in block.split('\n')]
+ if len(rows) > 1:
+ header0 = rows[0]
+ self.border = PIPE_NONE
+ if header0.startswith('|'):
+ self.border |= PIPE_LEFT
+ if self.RE_END_BORDER.search(header0) is not None:
+ self.border |= PIPE_RIGHT
+ row = self._split_row(header0)
+ row0_len = len(row)
+ is_table = row0_len > 1
+
+ # Each row in a single column table needs at least one pipe.
+ if not is_table and row0_len == 1 and self.border:
+ for index in range(1, len(rows)):
+ is_table = rows[index].startswith('|')
+ if not is_table:
+ is_table = self.RE_END_BORDER.search(rows[index]) is not None
+ if not is_table:
+ break
+
+ if is_table:
+ row = self._split_row(rows[1])
+ is_table = (len(row) == row0_len) and set(''.join(row)) <= set('|:- ')
+ if is_table:
+ self.separator = row
+
+ return is_table
def run(self, parent, blocks):
""" Parse a table block and build table. """
block = blocks.pop(0).split('\n')
- header = block[:2]
- rows = block[2:]
- # Get format type (bordered by pipes or not)
- border = False
- if header[0].startswith('|'):
- border = True
+ header = block[0].strip(' ')
+ rows = [] if len(block) < 3 else block[2:]
+
# Get alignment of columns
align = []
- for c in self._split_row(header[1], border):
+ for c in self.separator:
+ c = c.strip(' ')
if c.startswith(':') and c.endswith(':'):
align.append('center')
elif c.startswith(':'):
@@ -47,51 +91,146 @@ class TableProcessor(markdown.blockprocessors.BlockProcessor):
align.append('right')
else:
align.append(None)
+
# Build table
table = etree.SubElement(parent, 'table')
thead = etree.SubElement(table, 'thead')
- self._build_row(header[0], thead, align, border)
+ self._build_row(header, thead, align)
tbody = etree.SubElement(table, 'tbody')
- for row in rows:
- self._build_row(row, tbody, align, border)
+ if len(rows) == 0:
+ # Handle empty table
+ self._build_empty_row(tbody, align)
+ else:
+ for row in rows:
+ self._build_row(row.strip(' '), tbody, align)
+
+ def _build_empty_row(self, parent, align):
+ """Build an empty row."""
+ tr = etree.SubElement(parent, 'tr')
+ count = len(align)
+ while count:
+ etree.SubElement(tr, 'td')
+ count -= 1
- def _build_row(self, row, parent, align, border):
+ def _build_row(self, row, parent, align):
""" Given a row of text, build table cells. """
tr = etree.SubElement(parent, 'tr')
tag = 'td'
if parent.tag == 'thead':
tag = 'th'
- cells = self._split_row(row, border)
- # We use align here rather than cells to ensure every row
+ cells = self._split_row(row)
+ # We use align here rather than cells to ensure every row
# contains the same number of columns.
for i, a in enumerate(align):
c = etree.SubElement(tr, tag)
try:
- c.text = cells[i].strip()
- except IndexError:
+ c.text = cells[i].strip(' ')
+ except IndexError: # pragma: no cover
c.text = ""
if a:
- c.set('align', a)
+ if self.config['use_align_attribute']:
+ c.set('align', a)
+ else:
+ c.set('style', f'text-align: {a};')
- def _split_row(self, row, border):
+ def _split_row(self, row):
""" split a row of text into list of cells. """
- if border:
+ if self.border:
if row.startswith('|'):
row = row[1:]
- if row.endswith('|'):
- row = row[:-1]
- return row.split('|')
+ row = self.RE_END_BORDER.sub('', row)
+ return self._split(row)
+ def _split(self, row):
+ """ split a row of text with some code into a list of cells. """
+ elements = []
+ pipes = []
+ tics = []
+ tic_points = []
+ tic_region = []
+ good_pipes = []
+
+ # Parse row
+ # Throw out \\, and \|
+ for m in self.RE_CODE_PIPES.finditer(row):
+ # Store ` data (len, start_pos, end_pos)
+ if m.group(2):
+ # \`+
+ # Store length of each tic group: subtract \
+ tics.append(len(m.group(2)) - 1)
+ # Store start of group, end of group, and escape length
+ tic_points.append((m.start(2), m.end(2) - 1, 1))
+ elif m.group(3):
+ # `+
+ # Store length of each tic group
+ tics.append(len(m.group(3)))
+ # Store start of group, end of group, and escape length
+ tic_points.append((m.start(3), m.end(3) - 1, 0))
+ # Store pipe location
+ elif m.group(5):
+ pipes.append(m.start(5))
+
+ # Pair up tics according to size if possible
+ # Subtract the escape length *only* from the opening.
+ # Walk through tic list and see if tic has a close.
+ # Store the tic region (start of region, end of region).
+ pos = 0
+ tic_len = len(tics)
+ while pos < tic_len:
+ try:
+ tic_size = tics[pos] - tic_points[pos][2]
+ if tic_size == 0:
+ raise ValueError
+ index = tics[pos + 1:].index(tic_size) + 1
+ tic_region.append((tic_points[pos][0], tic_points[pos + index][1]))
+ pos += index + 1
+ except ValueError:
+ pos += 1
-class TableExtension(markdown.Extension):
+ # Resolve pipes. Check if they are within a tic pair region.
+ # Walk through pipes comparing them to each region.
+ # - If pipe position is less that a region, it isn't in a region
+ # - If it is within a region, we don't want it, so throw it out
+ # - If we didn't throw it out, it must be a table pipe
+ for pipe in pipes:
+ throw_out = False
+ for region in tic_region:
+ if pipe < region[0]:
+ # Pipe is not in a region
+ break
+ elif region[0] <= pipe <= region[1]:
+ # Pipe is within a code region. Throw it out.
+ throw_out = True
+ break
+ if not throw_out:
+ good_pipes.append(pipe)
+
+ # Split row according to table delimiters.
+ pos = 0
+ for pipe in good_pipes:
+ elements.append(row[pos:pipe])
+ pos = pipe + 1
+ elements.append(row[pos:])
+ return elements
+
+
+class TableExtension(Extension):
""" Add tables to Markdown. """
- def extendMarkdown(self, md, md_globals):
+ def __init__(self, **kwargs):
+ self.config = {
+ 'use_align_attribute': [False, 'True to use align attribute instead of style.'],
+ }
+
+ super().__init__(**kwargs)
+
+ def extendMarkdown(self, md):
""" Add an instance of TableProcessor to BlockParser. """
- md.parser.blockprocessors.add('table',
- TableProcessor(md.parser),
- '<hashheader')
+ if '|' not in md.ESCAPED_CHARS:
+ md.ESCAPED_CHARS.append('|')
+ processor = TableProcessor(md.parser, self.getConfigs())
+ md.parser.blockprocessors.register(processor, 'table', 75)
-def makeExtension(configs={}):
- return TableExtension(configs=configs)
+def makeExtension(**kwargs): # pragma: no cover
+ return TableExtension(**kwargs)
diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py
index 1d9489c..1ded18d 100644
--- a/markdown/extensions/toc.py
+++ b/markdown/extensions/toc.py
@@ -1,136 +1,384 @@
"""
Table of Contents Extension for Python-Markdown
-* * *
+===============================================
-(c) 2008 [Jack Miller](http://codezen.org)
+See <https://Python-Markdown.github.io/extensions/toc>
+for documentation.
-Dependencies:
-* [Markdown 2.0+](http://www.freewisdom.org/projects/python-markdown/)
+Oringinal code Copyright 2008 [Jack Miller](https://codezen.org/)
+
+All changes Copyright 2008-2014 The Python Markdown Project
+
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-import markdown
-from markdown import etree
+
+from . import Extension
+from ..treeprocessors import Treeprocessor
+from ..util import code_escape, parseBoolValue, AMP_SUBSTITUTE, HTML_PLACEHOLDER_RE, AtomicString
+from ..treeprocessors import UnescapeTreeprocessor
import re
+import html
+import unicodedata
+import xml.etree.ElementTree as etree
-class TocTreeprocessor(markdown.treeprocessors.Treeprocessor):
- # Iterator wrapper to get parent and child all at once
- def iterparent(self, root):
- for parent in root.getiterator():
- for child in parent:
- yield parent, child
- def run(self, doc):
- div = etree.Element("div")
- div.attrib["class"] = "toc"
- last_li = None
+def slugify(value, separator, unicode=False):
+ """ Slugify a string, to make it URL friendly. """
+ if not unicode:
+ # Replace Extended Latin characters with ASCII, i.e. žlutý → zluty
+ value = unicodedata.normalize('NFKD', value)
+ value = value.encode('ascii', 'ignore').decode('ascii')
+ value = re.sub(r'[^\w\s-]', '', value).strip().lower()
+ return re.sub(r'[{}\s]+'.format(separator), separator, value)
- # Add title to the div
- if self.config["title"][0]:
- header = etree.SubElement(div, "span")
- header.attrib["class"] = "toctitle"
- header.text = self.config["title"][0]
- level = 0
- list_stack=[div]
- header_rgx = re.compile("[Hh][123456]")
+def slugify_unicode(value, separator):
+ """ Slugify a string, to make it URL friendly while preserving Unicode characters. """
+ return slugify(value, separator, unicode=True)
- # Get a list of id attributes
- used_ids = []
- for c in doc.getiterator():
- if "id" in c.attrib:
- used_ids.append(c.attrib["id"])
- for (p, c) in self.iterparent(doc):
- if not c.text:
+IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')
+
+
+def unique(id, ids):
+ """ Ensure id is unique in set of ids. Append '_1', '_2'... if not """
+ while id in ids or not id:
+ m = IDCOUNT_RE.match(id)
+ if m:
+ id = '%s_%d' % (m.group(1), int(m.group(2))+1)
+ else:
+ id = '%s_%d' % (id, 1)
+ ids.add(id)
+ return id
+
+
+def get_name(el):
+ """Get title name."""
+
+ text = []
+ for c in el.itertext():
+ if isinstance(c, AtomicString):
+ text.append(html.unescape(c))
+ else:
+ text.append(c)
+ return ''.join(text).strip()
+
+
+def stashedHTML2text(text, md, strip_entities=True):
+ """ Extract raw HTML from stash, reduce to plain text and swap with placeholder. """
+ def _html_sub(m):
+ """ Substitute raw html with plain text. """
+ try:
+ raw = md.htmlStash.rawHtmlBlocks[int(m.group(1))]
+ except (IndexError, TypeError): # pragma: no cover
+ return m.group(0)
+ # Strip out tags and/or entities - leaving text
+ res = re.sub(r'(<[^>]+>)', '', raw)
+ if strip_entities:
+ res = re.sub(r'(&[\#a-zA-Z0-9]+;)', '', res)
+ return res
+
+ return HTML_PLACEHOLDER_RE.sub(_html_sub, text)
+
+
+def unescape(text):
+ """ Unescape escaped text. """
+ c = UnescapeTreeprocessor()
+ return c.unescape(text)
+
+
+def nest_toc_tokens(toc_list):
+ """Given an unsorted list with errors and skips, return a nested one.
+ [{'level': 1}, {'level': 2}]
+ =>
+ [{'level': 1, 'children': [{'level': 2, 'children': []}]}]
+
+ A wrong list is also converted:
+ [{'level': 2}, {'level': 1}]
+ =>
+ [{'level': 2, 'children': []}, {'level': 1, 'children': []}]
+ """
+
+ ordered_list = []
+ if len(toc_list):
+ # Initialize everything by processing the first entry
+ last = toc_list.pop(0)
+ last['children'] = []
+ levels = [last['level']]
+ ordered_list.append(last)
+ parents = []
+
+ # Walk the rest nesting the entries properly
+ while toc_list:
+ t = toc_list.pop(0)
+ current_level = t['level']
+ t['children'] = []
+
+ # Reduce depth if current level < last item's level
+ if current_level < levels[-1]:
+ # Pop last level since we know we are less than it
+ levels.pop()
+
+ # Pop parents and levels we are less than or equal to
+ to_pop = 0
+ for p in reversed(parents):
+ if current_level <= p['level']:
+ to_pop += 1
+ else: # pragma: no cover
+ break
+ if to_pop:
+ levels = levels[:-to_pop]
+ parents = parents[:-to_pop]
+
+ # Note current level as last
+ levels.append(current_level)
+
+ # Level is the same, so append to
+ # the current parent (if available)
+ if current_level == levels[-1]:
+ (parents[-1]['children'] if parents
+ else ordered_list).append(t)
+
+ # Current level is > last item's level,
+ # So make last item a parent and append current as child
+ else:
+ last['children'].append(t)
+ parents.append(last)
+ levels.append(current_level)
+ last = t
+
+ return ordered_list
+
+
+class TocTreeprocessor(Treeprocessor):
+ def __init__(self, md, config):
+ super().__init__(md)
+
+ self.marker = config["marker"]
+ self.title = config["title"]
+ self.base_level = int(config["baselevel"]) - 1
+ self.slugify = config["slugify"]
+ self.sep = config["separator"]
+ self.toc_class = config["toc_class"]
+ self.use_anchors = parseBoolValue(config["anchorlink"])
+ self.anchorlink_class = config["anchorlink_class"]
+ self.use_permalinks = parseBoolValue(config["permalink"], False)
+ if self.use_permalinks is None:
+ self.use_permalinks = config["permalink"]
+ self.permalink_class = config["permalink_class"]
+ self.permalink_title = config["permalink_title"]
+ self.header_rgx = re.compile("[Hh][123456]")
+ if isinstance(config["toc_depth"], str) and '-' in config["toc_depth"]:
+ self.toc_top, self.toc_bottom = [int(x) for x in config["toc_depth"].split('-')]
+ else:
+ self.toc_top = 1
+ self.toc_bottom = int(config["toc_depth"])
+
+ def iterparent(self, node):
+ ''' Iterator wrapper to get allowed parent and child all at once. '''
+
+ # We do not allow the marker inside a header as that
+ # would causes an enless loop of placing a new TOC
+ # inside previously generated TOC.
+ for child in node:
+ if not self.header_rgx.match(child.tag) and child.tag not in ['pre', 'code']:
+ yield node, child
+ yield from self.iterparent(child)
+
+ def replace_marker(self, root, elem):
+ ''' Replace marker with elem. '''
+ for (p, c) in self.iterparent(root):
+ text = ''.join(c.itertext()).strip()
+ if not text:
continue
# To keep the output from screwing up the
# validation by putting a <div> inside of a <p>
# we actually replace the <p> in its entirety.
- # We do not allow the marker inside a header as that
- # would causes an enless loop of placing a new TOC
- # inside previously generated TOC.
- if c.text.find(self.config["marker"][0]) > -1 and not header_rgx.match(c.tag):
+ # The <p> element may contain more than a single text content
+ # (nl2br can introduce a <br>). In this situation, c.text returns
+ # the very first content, ignore children contents or tail content.
+ # len(c) == 0 is here to ensure there is only text in the <p>.
+ if c.text and c.text.strip() == self.marker and len(c) == 0:
for i in range(len(p)):
if p[i] == c:
- p[i] = div
+ p[i] = elem
break
-
- if header_rgx.match(c.tag):
- tag_level = int(c.tag[-1])
-
- while tag_level < level:
- list_stack.pop()
- level -= 1
-
- if tag_level > level:
- newlist = etree.Element("ul")
- if last_li:
- last_li.append(newlist)
- else:
- list_stack[-1].append(newlist)
- list_stack.append(newlist)
- level += 1
-
- # Do not override pre-existing ids
- if not "id" in c.attrib:
- id = self.config["slugify"][0](c.text)
- if id in used_ids:
- ctr = 1
- while "%s_%d" % (id, ctr) in used_ids:
- ctr += 1
- id = "%s_%d" % (id, ctr)
- used_ids.append(id)
- c.attrib["id"] = id
- else:
- id = c.attrib["id"]
+ def set_level(self, elem):
+ ''' Adjust header level according to base level. '''
+ level = int(elem.tag[-1]) + self.base_level
+ if level > 6:
+ level = 6
+ elem.tag = 'h%d' % level
+
+ def add_anchor(self, c, elem_id): # @ReservedAssignment
+ anchor = etree.Element("a")
+ anchor.text = c.text
+ anchor.attrib["href"] = "#" + elem_id
+ anchor.attrib["class"] = self.anchorlink_class
+ c.text = ""
+ for elem in c:
+ anchor.append(elem)
+ while len(c):
+ c.remove(c[0])
+ c.append(anchor)
+
+ def add_permalink(self, c, elem_id):
+ permalink = etree.Element("a")
+ permalink.text = ("%spara;" % AMP_SUBSTITUTE
+ if self.use_permalinks is True
+ else self.use_permalinks)
+ permalink.attrib["href"] = "#" + elem_id
+ permalink.attrib["class"] = self.permalink_class
+ if self.permalink_title:
+ permalink.attrib["title"] = self.permalink_title
+ c.append(permalink)
+
+ def build_toc_div(self, toc_list):
+ """ Return a string div given a toc list. """
+ div = etree.Element("div")
+ div.attrib["class"] = self.toc_class
+
+ # Add title to the div
+ if self.title:
+ header = etree.SubElement(div, "span")
+ header.attrib["class"] = "toctitle"
+ header.text = self.title
+
+ def build_etree_ul(toc_list, parent):
+ ul = etree.SubElement(parent, "ul")
+ for item in toc_list:
# List item link, to be inserted into the toc div
- last_li = etree.Element("li")
- link = etree.SubElement(last_li, "a")
- link.text = c.text
- link.attrib["href"] = '#' + id
-
- if int(self.config["anchorlink"][0]):
- anchor = etree.SubElement(c, "a")
- anchor.text = c.text
- anchor.attrib["href"] = "#" + id
- anchor.attrib["class"] = "toclink"
- c.text = ""
-
- list_stack[-1].append(last_li)
-
-class TocExtension(markdown.Extension):
- def __init__(self, configs):
- self.config = { "marker" : ["[TOC]",
- "Text to find and replace with Table of Contents -"
- "Defaults to \"[TOC]\""],
- "slugify" : [self.slugify,
- "Function to generate anchors based on header text-"
- "Defaults to a built in slugify function."],
- "title" : [None,
- "Title to insert into TOC <div> - "
- "Defaults to None"],
- "anchorlink" : [0,
- "1 if header should be a self link"
- "Defaults to 0"]}
-
- for key, value in configs:
- self.setConfig(key, value)
-
- # This is exactly the same as Django's slugify
- def slugify(self, value):
- """ Slugify a string, to make it URL friendly. """
- import unicodedata
- value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
- value = unicode(re.sub('[^\w\s-]', '', value).strip().lower())
- return re.sub('[-\s]+','-',value)
-
- def extendMarkdown(self, md, md_globals):
- tocext = TocTreeprocessor(md)
- tocext.config = self.config
- md.treeprocessors.add("toc", tocext, "_begin")
-
-def makeExtension(configs={}):
- return TocExtension(configs=configs)
+ li = etree.SubElement(ul, "li")
+ link = etree.SubElement(li, "a")
+ link.text = item.get('name', '')
+ link.attrib["href"] = '#' + item.get('id', '')
+ if item['children']:
+ build_etree_ul(item['children'], li)
+ return ul
+
+ build_etree_ul(toc_list, div)
+
+ if 'prettify' in self.md.treeprocessors:
+ self.md.treeprocessors['prettify'].run(div)
+
+ return div
+
+ def run(self, doc):
+ # Get a list of id attributes
+ used_ids = set()
+ for el in doc.iter():
+ if "id" in el.attrib:
+ used_ids.add(el.attrib["id"])
+
+ toc_tokens = []
+ for el in doc.iter():
+ if isinstance(el.tag, str) and self.header_rgx.match(el.tag):
+ self.set_level(el)
+ text = get_name(el)
+
+ # Do not override pre-existing ids
+ if "id" not in el.attrib:
+ innertext = unescape(stashedHTML2text(text, self.md))
+ el.attrib["id"] = unique(self.slugify(innertext, self.sep), used_ids)
+
+ if int(el.tag[-1]) >= self.toc_top and int(el.tag[-1]) <= self.toc_bottom:
+ toc_tokens.append({
+ 'level': int(el.tag[-1]),
+ 'id': el.attrib["id"],
+ 'name': stashedHTML2text(
+ code_escape(el.attrib.get('data-toc-label', text)),
+ self.md, strip_entities=False
+ )
+ })
+
+ # Remove the data-toc-label attribute as it is no longer needed
+ if 'data-toc-label' in el.attrib:
+ del el.attrib['data-toc-label']
+
+ if self.use_anchors:
+ self.add_anchor(el, el.attrib["id"])
+ if self.use_permalinks not in [False, None]:
+ self.add_permalink(el, el.attrib["id"])
+
+ toc_tokens = nest_toc_tokens(toc_tokens)
+ div = self.build_toc_div(toc_tokens)
+ if self.marker:
+ self.replace_marker(doc, div)
+
+ # serialize and attach to markdown instance.
+ toc = self.md.serializer(div)
+ for pp in self.md.postprocessors:
+ toc = pp.run(toc)
+ self.md.toc_tokens = toc_tokens
+ self.md.toc = toc
+
+
+class TocExtension(Extension):
+
+ TreeProcessorClass = TocTreeprocessor
+
+ def __init__(self, **kwargs):
+ self.config = {
+ "marker": ['[TOC]',
+ 'Text to find and replace with Table of Contents - '
+ 'Set to an empty string to disable. Defaults to "[TOC]"'],
+ "title": ["",
+ "Title to insert into TOC <div> - "
+ "Defaults to an empty string"],
+ "toc_class": ['toc',
+ 'CSS class(es) used for the link. '
+ 'Defaults to "toclink"'],
+ "anchorlink": [False,
+ "True if header should be a self link - "
+ "Defaults to False"],
+ "anchorlink_class": ['toclink',
+ 'CSS class(es) used for the link. '
+ 'Defaults to "toclink"'],
+ "permalink": [0,
+ "True or link text if a Sphinx-style permalink should "
+ "be added - Defaults to False"],
+ "permalink_class": ['headerlink',
+ 'CSS class(es) used for the link. '
+ 'Defaults to "headerlink"'],
+ "permalink_title": ["Permanent link",
+ "Title attribute of the permalink - "
+ "Defaults to 'Permanent link'"],
+ "baselevel": ['1', 'Base level for headers.'],
+ "slugify": [slugify,
+ "Function to generate anchors based on header text - "
+ "Defaults to the headerid ext's slugify function."],
+ 'separator': ['-', 'Word separator. Defaults to "-".'],
+ "toc_depth": [6,
+ 'Define the range of section levels to include in'
+ 'the Table of Contents. A single integer (b) defines'
+ 'the bottom section level (<h1>..<hb>) only.'
+ 'A string consisting of two digits separated by a hyphen'
+ 'in between ("2-5"), define the top (t) and the'
+ 'bottom (b) (<ht>..<hb>). Defaults to `6` (bottom).'],
+ }
+
+ super().__init__(**kwargs)
+
+ def extendMarkdown(self, md):
+ md.registerExtension(self)
+ self.md = md
+ self.reset()
+ tocext = self.TreeProcessorClass(md, self.getConfigs())
+ # Headerid ext is set to '>prettify'. With this set to '_end',
+ # it should always come after headerid ext (and honor ids assigned
+ # by the header id extension) if both are used. Same goes for
+ # attr_list extension. This must come last because we don't want
+ # to redefine ids after toc is created. But we do want toc prettified.
+ md.treeprocessors.register(tocext, 'toc', 5)
+
+ def reset(self):
+ self.md.toc = ''
+ self.md.toc_tokens = []
+
+
+def makeExtension(**kwargs): # pragma: no cover
+ return TocExtension(**kwargs)
diff --git a/markdown/extensions/wikilinks.py b/markdown/extensions/wikilinks.py
index df44e1c..cddee7a 100644
--- a/markdown/extensions/wikilinks.py
+++ b/markdown/extensions/wikilinks.py
@@ -1,155 +1,87 @@
-#!/usr/bin/env python
-
'''
WikiLinks Extension for Python-Markdown
======================================
-Converts [[WikiLinks]] to relative links. Requires Python-Markdown 2.0+
-
-Basic usage:
-
- >>> import markdown
- >>> text = "Some text with a [[WikiLink]]."
- >>> html = markdown.markdown(text, ['wikilinks'])
- >>> html
- u'<p>Some text with a <a class="wikilink" href="/WikiLink/">WikiLink</a>.</p>'
-
-Whitespace behavior:
-
- >>> markdown.markdown('[[ foo bar_baz ]]', ['wikilinks'])
- u'<p><a class="wikilink" href="/foo_bar_baz/">foo bar_baz</a></p>'
- >>> markdown.markdown('foo [[ ]] bar', ['wikilinks'])
- u'<p>foo bar</p>'
-
-To define custom settings the simple way:
-
- >>> markdown.markdown(text,
- ... ['wikilinks(base_url=/wiki/,end_url=.html,html_class=foo)']
- ... )
- u'<p>Some text with a <a class="foo" href="/wiki/WikiLink.html">WikiLink</a>.</p>'
-
-Custom settings the complex way:
-
- >>> md = markdown.Markdown(
- ... extensions = ['wikilinks'],
- ... extension_configs = {'wikilinks': [
- ... ('base_url', 'http://example.com/'),
- ... ('end_url', '.html'),
- ... ('html_class', '') ]},
- ... safe_mode = True)
- >>> md.convert(text)
- u'<p>Some text with a <a href="http://example.com/WikiLink.html">WikiLink</a>.</p>'
-
-Use MetaData with mdx_meta.py (Note the blank html_class in MetaData):
-
- >>> text = """wiki_base_url: http://example.com/
- ... wiki_end_url: .html
- ... wiki_html_class:
- ...
- ... Some text with a [[WikiLink]]."""
- >>> md = markdown.Markdown(extensions=['meta', 'wikilinks'])
- >>> md.convert(text)
- u'<p>Some text with a <a href="http://example.com/WikiLink.html">WikiLink</a>.</p>'
-
-MetaData should not carry over to next document:
+Converts [[WikiLinks]] to relative links.
- >>> md.convert("No [[MetaData]] here.")
- u'<p>No <a class="wikilink" href="/MetaData/">MetaData</a> here.</p>'
+See <https://Python-Markdown.github.io/extensions/wikilinks>
+for documentation.
-Define a custom URL builder:
+Original code Copyright [Waylan Limberg](http://achinghead.com/).
- >>> def my_url_builder(label, base, end):
- ... return '/bar/'
- >>> md = markdown.Markdown(extensions=['wikilinks'],
- ... extension_configs={'wikilinks' : [('build_url', my_url_builder)]})
- >>> md.convert('[[foo]]')
- u'<p><a class="wikilink" href="/bar/">foo</a></p>'
+All changes Copyright The Python Markdown Project
-From the command line:
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
- python markdown.py -x wikilinks(base_url=http://example.com/,end_url=.html,html_class=foo) src.txt
-
-By [Waylan Limberg](http://achinghead.com/).
-
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
-
-Dependencies:
-* [Python 2.3+](http://python.org)
-* [Markdown 2.0+](http://www.freewisdom.org/projects/python-markdown/)
'''
-import markdown
+from . import Extension
+from ..inlinepatterns import InlineProcessor
+import xml.etree.ElementTree as etree
import re
+
def build_url(label, base, end):
""" Build a url from the label, a base, and an end. """
clean_label = re.sub(r'([ ]+_)|(_[ ]+)|([ ]+)', '_', label)
- return '%s%s%s'% (base, clean_label, end)
+ return '{}{}{}'.format(base, clean_label, end)
+
+class WikiLinkExtension(Extension):
-class WikiLinkExtension(markdown.Extension):
- def __init__(self, configs):
- # set extension defaults
+ def __init__(self, **kwargs):
self.config = {
- 'base_url' : ['/', 'String to append to beginning or URL.'],
- 'end_url' : ['/', 'String to append to end of URL.'],
- 'html_class' : ['wikilink', 'CSS hook. Leave blank for none.'],
- 'build_url' : [build_url, 'Callable formats URL from label.'],
+ 'base_url': ['/', 'String to append to beginning or URL.'],
+ 'end_url': ['/', 'String to append to end of URL.'],
+ 'html_class': ['wikilink', 'CSS hook. Leave blank for none.'],
+ 'build_url': [build_url, 'Callable formats URL from label.'],
}
-
- # Override defaults with user settings
- for key, value in configs :
- self.setConfig(key, value)
-
- def extendMarkdown(self, md, md_globals):
+
+ super().__init__(**kwargs)
+
+ def extendMarkdown(self, md):
self.md = md
-
+
# append to end of inline patterns
- WIKILINK_RE = r'\[\[([A-Za-z0-9_ -]+)\]\]'
- wikilinkPattern = WikiLinks(WIKILINK_RE, self.config)
+ WIKILINK_RE = r'\[\[([\w0-9_ -]+)\]\]'
+ wikilinkPattern = WikiLinksInlineProcessor(WIKILINK_RE, self.getConfigs())
wikilinkPattern.md = md
- md.inlinePatterns.add('wikilink', wikilinkPattern, "<not_strong")
+ md.inlinePatterns.register(wikilinkPattern, 'wikilink', 75)
-class WikiLinks(markdown.inlinepatterns.Pattern):
+class WikiLinksInlineProcessor(InlineProcessor):
def __init__(self, pattern, config):
- markdown.inlinepatterns.Pattern.__init__(self, pattern)
+ super().__init__(pattern)
self.config = config
-
- def handleMatch(self, m):
- if m.group(2).strip():
+
+ def handleMatch(self, m, data):
+ if m.group(1).strip():
base_url, end_url, html_class = self._getMeta()
- label = m.group(2).strip()
- url = self.config['build_url'][0](label, base_url, end_url)
- a = markdown.etree.Element('a')
- a.text = label
+ label = m.group(1).strip()
+ url = self.config['build_url'](label, base_url, end_url)
+ a = etree.Element('a')
+ a.text = label
a.set('href', url)
if html_class:
a.set('class', html_class)
else:
a = ''
- return a
+ return a, m.start(0), m.end(0)
def _getMeta(self):
""" Return meta data or config data. """
- base_url = self.config['base_url'][0]
- end_url = self.config['end_url'][0]
- html_class = self.config['html_class'][0]
+ base_url = self.config['base_url']
+ end_url = self.config['end_url']
+ html_class = self.config['html_class']
if hasattr(self.md, 'Meta'):
- if self.md.Meta.has_key('wiki_base_url'):
+ if 'wiki_base_url' in self.md.Meta:
base_url = self.md.Meta['wiki_base_url'][0]
- if self.md.Meta.has_key('wiki_end_url'):
+ if 'wiki_end_url' in self.md.Meta:
end_url = self.md.Meta['wiki_end_url'][0]
- if self.md.Meta.has_key('wiki_html_class'):
+ if 'wiki_html_class' in self.md.Meta:
html_class = self.md.Meta['wiki_html_class'][0]
return base_url, end_url, html_class
-
-
-def makeExtension(configs=None) :
- return WikiLinkExtension(configs=configs)
-
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
+def makeExtension(**kwargs): # pragma: no cover
+ return WikiLinkExtension(**kwargs)
diff --git a/markdown/html4.py b/markdown/html4.py
deleted file mode 100644
index 08f241d..0000000
--- a/markdown/html4.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# markdown/html4.py
-#
-# Add html4 serialization to older versions of Elementree
-# Taken from ElementTree 1.3 preview with slight modifications
-#
-# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
-#
-# fredrik@pythonware.com
-# http://www.pythonware.com
-#
-# --------------------------------------------------------------------
-# The ElementTree toolkit is
-#
-# Copyright (c) 1999-2007 by Fredrik Lundh
-#
-# By obtaining, using, and/or copying this software and/or its
-# associated documentation, you agree that you have read, understood,
-# and will comply with the following terms and conditions:
-#
-# Permission to use, copy, modify, and distribute this software and
-# its associated documentation for any purpose and without fee is
-# hereby granted, provided that the above copyright notice appears in
-# all copies, and that both that copyright notice and this permission
-# notice appear in supporting documentation, and that the name of
-# Secret Labs AB or the author not be used in advertising or publicity
-# pertaining to distribution of the software without specific, written
-# prior permission.
-#
-# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
-# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
-# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
-# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
-# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
-# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
-# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
-# OF THIS SOFTWARE.
-# --------------------------------------------------------------------
-
-
-import markdown
-ElementTree = markdown.etree.ElementTree
-QName = markdown.etree.QName
-Comment = markdown.etree.Comment
-PI = markdown.etree.PI
-ProcessingInstruction = markdown.etree.ProcessingInstruction
-
-HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
- "img", "input", "isindex", "link", "meta" "param")
-
-try:
- HTML_EMPTY = set(HTML_EMPTY)
-except NameError:
- pass
-
-_namespace_map = {
- # "well-known" namespace prefixes
- "http://www.w3.org/XML/1998/namespace": "xml",
- "http://www.w3.org/1999/xhtml": "html",
- "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
- "http://schemas.xmlsoap.org/wsdl/": "wsdl",
- # xml schema
- "http://www.w3.org/2001/XMLSchema": "xs",
- "http://www.w3.org/2001/XMLSchema-instance": "xsi",
- # dublic core
- "http://purl.org/dc/elements/1.1/": "dc",
-}
-
-
-def _raise_serialization_error(text):
- raise TypeError(
- "cannot serialize %r (type %s)" % (text, type(text).__name__)
- )
-
-def _encode(text, encoding):
- try:
- return text.encode(encoding, "xmlcharrefreplace")
- except (TypeError, AttributeError):
- _raise_serialization_error(text)
-
-def _escape_cdata(text, encoding):
- # escape character data
- try:
- # it's worth avoiding do-nothing calls for strings that are
- # shorter than 500 character, or so. assume that's, by far,
- # the most common case in most applications.
- if "&" in text:
- text = text.replace("&", "&amp;")
- if "<" in text:
- text = text.replace("<", "&lt;")
- if ">" in text:
- text = text.replace(">", "&gt;")
- return text.encode(encoding, "xmlcharrefreplace")
- except (TypeError, AttributeError):
- _raise_serialization_error(text)
-
-
-def _escape_attrib(text, encoding):
- # escape attribute value
- try:
- if "&" in text:
- text = text.replace("&", "&amp;")
- if "<" in text:
- text = text.replace("<", "&lt;")
- if ">" in text:
- text = text.replace(">", "&gt;")
- if "\"" in text:
- text = text.replace("\"", "&quot;")
- if "\n" in text:
- text = text.replace("\n", "&#10;")
- return text.encode(encoding, "xmlcharrefreplace")
- except (TypeError, AttributeError):
- _raise_serialization_error(text)
-
-def _escape_attrib_html(text, encoding):
- # escape attribute value
- try:
- if "&" in text:
- text = text.replace("&", "&amp;")
- if ">" in text:
- text = text.replace(">", "&gt;")
- if "\"" in text:
- text = text.replace("\"", "&quot;")
- return text.encode(encoding, "xmlcharrefreplace")
- except (TypeError, AttributeError):
- _raise_serialization_error(text)
-
-
-def _serialize_html(write, elem, encoding, qnames, namespaces):
- tag = elem.tag
- text = elem.text
- if tag is Comment:
- write("<!--%s-->" % _escape_cdata(text, encoding))
- elif tag is ProcessingInstruction:
- write("<?%s?>" % _escape_cdata(text, encoding))
- else:
- tag = qnames[tag]
- if tag is None:
- if text:
- write(_escape_cdata(text, encoding))
- for e in elem:
- _serialize_html(write, e, encoding, qnames, None)
- else:
- write("<" + tag)
- items = elem.items()
- if items or namespaces:
- items.sort() # lexical order
- for k, v in items:
- if isinstance(k, QName):
- k = k.text
- if isinstance(v, QName):
- v = qnames[v.text]
- else:
- v = _escape_attrib_html(v, encoding)
- # FIXME: handle boolean attributes
- write(" %s=\"%s\"" % (qnames[k], v))
- if namespaces:
- items = namespaces.items()
- items.sort(key=lambda x: x[1]) # sort on prefix
- for v, k in items:
- if k:
- k = ":" + k
- write(" xmlns%s=\"%s\"" % (
- k.encode(encoding),
- _escape_attrib(v, encoding)
- ))
- write(">")
- tag = tag.lower()
- if text:
- if tag == "script" or tag == "style":
- write(_encode(text, encoding))
- else:
- write(_escape_cdata(text, encoding))
- for e in elem:
- _serialize_html(write, e, encoding, qnames, None)
- if tag not in HTML_EMPTY:
- write("</" + tag + ">")
- if elem.tail:
- write(_escape_cdata(elem.tail, encoding))
-
-def write_html(root, f,
- # keyword arguments
- encoding="us-ascii",
- default_namespace=None):
- assert root is not None
- if not hasattr(f, "write"):
- f = open(f, "wb")
- write = f.write
- if not encoding:
- encoding = "us-ascii"
- qnames, namespaces = _namespaces(
- root, encoding, default_namespace
- )
- _serialize_html(
- write, root, encoding, qnames, namespaces
- )
-
-# --------------------------------------------------------------------
-# serialization support
-
-def _namespaces(elem, encoding, default_namespace=None):
- # identify namespaces used in this tree
-
- # maps qnames to *encoded* prefix:local names
- qnames = {None: None}
-
- # maps uri:s to prefixes
- namespaces = {}
- if default_namespace:
- namespaces[default_namespace] = ""
-
- def encode(text):
- return text.encode(encoding)
-
- def add_qname(qname):
- # calculate serialized qname representation
- try:
- if qname[:1] == "{":
- uri, tag = qname[1:].split("}", 1)
- prefix = namespaces.get(uri)
- if prefix is None:
- prefix = _namespace_map.get(uri)
- if prefix is None:
- prefix = "ns%d" % len(namespaces)
- if prefix != "xml":
- namespaces[uri] = prefix
- if prefix:
- qnames[qname] = encode("%s:%s" % (prefix, tag))
- else:
- qnames[qname] = encode(tag) # default element
- else:
- if default_namespace:
- # FIXME: can this be handled in XML 1.0?
- raise ValueError(
- "cannot use non-qualified names with "
- "default_namespace option"
- )
- qnames[qname] = encode(qname)
- except TypeError:
- _raise_serialization_error(qname)
-
- # populate qname and namespaces table
- try:
- iterate = elem.iter
- except AttributeError:
- iterate = elem.getiterator # cET compatibility
- for elem in iterate():
- tag = elem.tag
- if isinstance(tag, QName) and tag.text not in qnames:
- add_qname(tag.text)
- elif isinstance(tag, basestring):
- if tag not in qnames:
- add_qname(tag)
- elif tag is not None and tag is not Comment and tag is not PI:
- _raise_serialization_error(tag)
- for key, value in elem.items():
- if isinstance(key, QName):
- key = key.text
- if key not in qnames:
- add_qname(key)
- if isinstance(value, QName) and value.text not in qnames:
- add_qname(value.text)
- text = elem.text
- if isinstance(text, QName) and text.text not in qnames:
- add_qname(text.text)
- return qnames, namespaces
-
-def to_html_string(element, encoding=None):
- class dummy:
- pass
- data = []
- file = dummy()
- file.write = data.append
- write_html(ElementTree(element).getroot(),file,encoding)
- return "".join(data)
diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
new file mode 100644
index 0000000..3512d1a
--- /dev/null
+++ b/markdown/htmlparser.py
@@ -0,0 +1,323 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2020 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+import re
+import importlib.util
+import sys
+
+
+# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
+# Users can still do `from html import parser` and get the default behavior.
+spec = importlib.util.find_spec('html.parser')
+htmlparser = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(htmlparser)
+sys.modules['htmlparser'] = htmlparser
+
+# Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions.
+htmlparser.piclose = re.compile(r'\?>')
+# Monkeypatch HTMLParser to only recognize entity references with a closing semicolon.
+htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
+# Monkeypatch HTMLParser to no longer support partial entities. We are always feeding a complete block,
+# so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete,
+# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
+htmlparser.incomplete = htmlparser.entityref
+# Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value.
+htmlparser.locatestarttagend_tolerant = re.compile(r"""
+ <[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here
+ (?:[\s/]* # optional whitespace before attribute name
+ (?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
+ (?:\s*=+\s* # value indicator
+ (?:'[^']*' # LITA-enclosed value
+ |"[^"]*" # LIT-enclosed value
+ |(?!['"])[^`>\s]* # bare value <= added backtick here
+ )
+ (?:\s*,)* # possibly followed by a comma
+ )?(?:\s|/(?!>))*
+ )*
+ )?
+ \s* # trailing whitespace
+""", re.VERBOSE)
+
+# Match a blank line at the start of a block of text (two newlines).
+# The newlines may be preceded by additional whitespace.
+blank_line_re = re.compile(r'^([ ]*\n){2}')
+
+
+class HTMLExtractor(htmlparser.HTMLParser):
+ """
+ Extract raw HTML from text.
+
+ The raw HTML is stored in the `htmlStash` of the Markdown instance passed
+ to `md` and the remaining text is stored in `cleandoc` as a list of strings.
+ """
+
+ def __init__(self, md, *args, **kwargs):
+ if 'convert_charrefs' not in kwargs:
+ kwargs['convert_charrefs'] = False
+
+ # Block tags that should contain no content (self closing)
+ self.empty_tags = set(['hr'])
+
+ # This calls self.reset
+ super().__init__(*args, **kwargs)
+ self.md = md
+
+ def reset(self):
+ """Reset this instance. Loses all unprocessed data."""
+ self.inraw = False
+ self.intail = False
+ self.stack = [] # When inraw==True, stack contains a list of tags
+ self._cache = []
+ self.cleandoc = []
+ super().reset()
+
+ def close(self):
+ """Handle any buffered data."""
+ super().close()
+ if len(self.rawdata):
+ # Temp fix for https://bugs.python.org/issue41989
+ # TODO: remove this when the bug is fixed in all supported Python versions.
+ if self.convert_charrefs and not self.cdata_elem: # pragma: no cover
+ self.handle_data(htmlparser.unescape(self.rawdata))
+ else:
+ self.handle_data(self.rawdata)
+ # Handle any unclosed tags.
+ if len(self._cache):
+ self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
+ self._cache = []
+
+ @property
+ def line_offset(self):
+ """Returns char index in self.rawdata for the start of the current line. """
+ if self.lineno > 1 and '\n' in self.rawdata:
+ m = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata)
+ if m:
+ return m.end()
+ else: # pragma: no cover
+ # Value of self.lineno must exceed total number of lines.
+ # Find index of beginning of last line.
+ return self.rawdata.rfind('\n')
+ return 0
+
+ def at_line_start(self):
+ """
+ Returns True if current position is at start of line.
+
+ Allows for up to three blank spaces at start of line.
+ """
+ if self.offset == 0:
+ return True
+ if self.offset > 3:
+ return False
+ # Confirm up to first 3 chars are whitespace
+ return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''
+
+ def get_endtag_text(self, tag):
+ """
+ Returns the text of the end tag.
+
+ If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
+ """
+ # Attempt to extract actual tag from raw source text
+ start = self.line_offset + self.offset
+ m = htmlparser.endendtag.search(self.rawdata, start)
+ if m:
+ return self.rawdata[start:m.end()]
+ else: # pragma: no cover
+ # Failed to extract from raw data. Assume well formed and lowercase.
+ return '</{}>'.format(tag)
+
+ def handle_starttag(self, tag, attrs):
+ # Handle tags that should always be empty and do not specify a closing tag
+ if tag in self.empty_tags:
+ self.handle_startendtag(tag, attrs)
+ return
+
+ if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
+ # Started a new raw block. Prepare stack.
+ self.inraw = True
+ self.cleandoc.append('\n')
+
+ text = self.get_starttag_text()
+ if self.inraw:
+ self.stack.append(tag)
+ self._cache.append(text)
+ else:
+ self.cleandoc.append(text)
+ if tag in self.CDATA_CONTENT_ELEMENTS:
+ # This is presumably a standalone tag in a code span (see #1036).
+ self.clear_cdata_mode()
+
+ def handle_endtag(self, tag):
+ text = self.get_endtag_text(tag)
+
+ if self.inraw:
+ self._cache.append(text)
+ if tag in self.stack:
+ # Remove tag from stack
+ while self.stack:
+ if self.stack.pop() == tag:
+ break
+ if len(self.stack) == 0:
+ # End of raw block.
+ if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
+ # Preserve blank line and end of raw block.
+ self._cache.append('\n')
+ else:
+ # More content exists after endtag.
+ self.intail = True
+ # Reset stack.
+ self.inraw = False
+ self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
+ # Insert blank line between this and next line.
+ self.cleandoc.append('\n\n')
+ self._cache = []
+ else:
+ self.cleandoc.append(text)
+
+ def handle_data(self, data):
+ if self.intail and '\n' in data:
+ self.intail = False
+ if self.inraw:
+ self._cache.append(data)
+ else:
+ self.cleandoc.append(data)
+
+ def handle_empty_tag(self, data, is_block):
+ """ Handle empty tags (`<data>`). """
+ if self.inraw or self.intail:
+ # Append this to the existing raw block
+ self._cache.append(data)
+ elif self.at_line_start() and is_block:
+ # Handle this as a standalone raw block
+ if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
+ # Preserve blank line after tag in raw block.
+ data += '\n'
+ else:
+ # More content exists after tag.
+ self.intail = True
+ item = self.cleandoc[-1] if self.cleandoc else ''
+ # If we only have one newline before block element, add another
+ if not item.endswith('\n\n') and item.endswith('\n'):
+ self.cleandoc.append('\n')
+ self.cleandoc.append(self.md.htmlStash.store(data))
+ # Insert blank line between this and next line.
+ self.cleandoc.append('\n\n')
+ else:
+ self.cleandoc.append(data)
+
+ def handle_startendtag(self, tag, attrs):
+ self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))
+
+ def handle_charref(self, name):
+ self.handle_empty_tag('&#{};'.format(name), is_block=False)
+
+ def handle_entityref(self, name):
+ self.handle_empty_tag('&{};'.format(name), is_block=False)
+
+ def handle_comment(self, data):
+ self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)
+
+ def handle_decl(self, data):
+ self.handle_empty_tag('<!{}>'.format(data), is_block=True)
+
+ def handle_pi(self, data):
+ self.handle_empty_tag('<?{}?>'.format(data), is_block=True)
+
+ def unknown_decl(self, data):
+ end = ']]>' if data.startswith('CDATA[') else ']>'
+ self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)
+
+ def parse_pi(self, i):
+ if self.at_line_start() or self.intail:
+ return super().parse_pi(i)
+ # This is not the beginning of a raw block so treat as plain data
+ # and avoid consuming any tags which may follow (see #1066).
+ self.handle_data('<?')
+ return i + 2
+
+ def parse_html_declaration(self, i):
+ if self.at_line_start() or self.intail:
+ return super().parse_html_declaration(i)
+ # This is not the beginning of a raw block so treat as plain data
+ # and avoid consuming any tags which may follow (see #1066).
+ self.handle_data('<!')
+ return i + 2
+
+ # The rest has been copied from base class in standard lib to address #1036.
+ # As __startag_text is private, all references to it must be in this subclass.
+ # The last few lines of parse_starttag are reversed so that handle_starttag
+ # can override cdata_mode in certain situations (in a code span).
+ __starttag_text = None
+
+ def get_starttag_text(self):
+ """Return full source of start tag: '<...>'."""
+ return self.__starttag_text
+
+ def parse_starttag(self, i): # pragma: no cover
+ self.__starttag_text = None
+ endpos = self.check_for_whole_start_tag(i)
+ if endpos < 0:
+ return endpos
+ rawdata = self.rawdata
+ self.__starttag_text = rawdata[i:endpos]
+
+ # Now parse the data between i+1 and j into a tag and attrs
+ attrs = []
+ match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
+ assert match, 'unexpected call to parse_starttag()'
+ k = match.end()
+ self.lasttag = tag = match.group(1).lower()
+ while k < endpos:
+ m = htmlparser.attrfind_tolerant.match(rawdata, k)
+ if not m:
+ break
+ attrname, rest, attrvalue = m.group(1, 2, 3)
+ if not rest:
+ attrvalue = None
+ elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
+ attrvalue[:1] == '"' == attrvalue[-1:]: # noqa: E127
+ attrvalue = attrvalue[1:-1]
+ if attrvalue:
+ attrvalue = htmlparser.unescape(attrvalue)
+ attrs.append((attrname.lower(), attrvalue))
+ k = m.end()
+
+ end = rawdata[k:endpos].strip()
+ if end not in (">", "/>"):
+ lineno, offset = self.getpos()
+ if "\n" in self.__starttag_text:
+ lineno = lineno + self.__starttag_text.count("\n")
+ offset = len(self.__starttag_text) \
+ - self.__starttag_text.rfind("\n") # noqa: E127
+ else:
+ offset = offset + len(self.__starttag_text)
+ self.handle_data(rawdata[i:endpos])
+ return endpos
+ if end.endswith('/>'):
+ # XHTML-style empty tag: <span attr="value" />
+ self.handle_startendtag(tag, attrs)
+ else:
+ # *** set cdata_mode first so we can override it in handle_starttag (see #1036) ***
+ if tag in self.CDATA_CONTENT_ELEMENTS:
+ self.set_cdata_mode(tag)
+ self.handle_starttag(tag, attrs)
+ return endpos
diff --git a/markdown/inlinepatterns.py b/markdown/inlinepatterns.py
index 917a9d3..eb313bd 100644
--- a/markdown/inlinepatterns.py
+++ b/markdown/inlinepatterns.py
@@ -1,4 +1,23 @@
"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+
INLINE PATTERNS
=============================================================================
@@ -41,71 +60,129 @@ So, we apply the expressions in the following order:
* finally we apply strong and emphasis
"""
-import markdown
+from . import util
+from collections import namedtuple
import re
-from urlparse import urlparse, urlunparse
-import sys
-if sys.version >= "3.0":
- from html import entities as htmlentitydefs
-else:
- import htmlentitydefs
+import xml.etree.ElementTree as etree
+try: # pragma: no cover
+ from html import entities
+except ImportError: # pragma: no cover
+ import htmlentitydefs as entities
+
+
+def build_inlinepatterns(md, **kwargs):
+ """ Build the default set of inline patterns for Markdown. """
+ inlinePatterns = util.Registry()
+ inlinePatterns.register(BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190)
+ inlinePatterns.register(EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180)
+ inlinePatterns.register(ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170)
+ inlinePatterns.register(LinkInlineProcessor(LINK_RE, md), 'link', 160)
+ inlinePatterns.register(ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150)
+ inlinePatterns.register(
+ ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140
+ )
+ inlinePatterns.register(
+ ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130
+ )
+ inlinePatterns.register(
+ ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125
+ )
+ inlinePatterns.register(AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120)
+ inlinePatterns.register(AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110)
+ inlinePatterns.register(SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100)
+ inlinePatterns.register(HtmlInlineProcessor(HTML_RE, md), 'html', 90)
+ inlinePatterns.register(HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80)
+ inlinePatterns.register(SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70)
+ inlinePatterns.register(AsteriskProcessor(r'\*'), 'em_strong', 60)
+ inlinePatterns.register(UnderscoreProcessor(r'_'), 'em_strong2', 50)
+ return inlinePatterns
+
"""
The actual regular expressions for patterns
-----------------------------------------------------------------------------
"""
-NOBRACKET = r'[^\]\[]*'
-BRK = ( r'\[('
- + (NOBRACKET + r'(\[')*6
- + (NOBRACKET+ r'\])*')*6
- + NOBRACKET + r')\]' )
NOIMG = r'(?<!\!)'
-BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")``
-ESCAPE_RE = r'\\(.)' # \<
-EMPHASIS_RE = r'(\*)([^\*]+)\2' # *emphasis*
-STRONG_RE = r'(\*{2}|_{2})(.+?)\2' # **strong**
-STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2' # ***strong***
+# `e=f()` or ``e=f("`")``
+BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'
+
+# \<
+ESCAPE_RE = r'\\(.)'
+
+# *emphasis*
+EMPHASIS_RE = r'(\*)([^\*]+)\1'
+
+# **strong**
+STRONG_RE = r'(\*{2})(.+?)\1'
+
+# __smart__strong__
+SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'
-if markdown.SMART_EMPHASIS:
- EMPHASIS_2_RE = r'(?<!\w)(_)(\S.+?)\2(?!\w)' # _emphasis_
-else:
- EMPHASIS_2_RE = r'(_)(.+?)\2' # _emphasis_
+# _smart_emphasis_
+SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'
-LINK_RE = NOIMG + BRK + \
-r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12)?\)'''
-# [text](url) or [text](<url>)
+# __strong _em__
+SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'
+
+# ***strongem*** or ***em*strong**
+EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'
+
+# ___strongem___ or ___em_strong__
+EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'
+
+# ***strong**em*
+STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'
+
+# ___strong__em_
+STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'
+
+# **strong*em***
+STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'
+
+# [text](url) or [text](<url>) or [text](url "title")
+LINK_RE = NOIMG + r'\['
-IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
-REFERENCE_RE = NOIMG + BRK+ r'\s*\[([^\]]*)\]' # [Google][3]
-IMAGE_REFERENCE_RE = r'\!' + BRK + '\s*\[([^\]]*)\]' # ![alt text][2]
-NOT_STRONG_RE = r'((^| )(\*|_)( |$))' # stand-alone * or _
-AUTOLINK_RE = r'<((?:f|ht)tps?://[^>]*)>' # <http://www.123.com>
-AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>' # <me@example.com>
+IMAGE_LINK_RE = r'\!\['
+
+# [Google][3]
+REFERENCE_RE = LINK_RE
+
+# ![alt text][2]
+IMAGE_REFERENCE_RE = IMAGE_LINK_RE
+
+# stand-alone * or _
+NOT_STRONG_RE = r'((^|\s)(\*|_)(\s|$))'
+
+# <http://www.123.com>
+AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'
-HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)' # <...>
-ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # &amp;
-LINE_BREAK_RE = r' \n' # two spaces at end of line
-LINE_BREAK_2_RE = r' $' # two spaces at end of text
+# <me@example.com>
+AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'
+
+# <...>
+HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!<!--|-->).)*--)>)'
+
+# "&#38;" (decimal) or "&#x26;" (hex) or "&amp;" (named)
+ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
+
+# two spaces at end of line
+LINE_BREAK_RE = r' \n'
def dequote(string):
"""Remove quotes from around a string."""
- if ( ( string.startswith('"') and string.endswith('"'))
- or (string.startswith("'") and string.endswith("'")) ):
+ if ((string.startswith('"') and string.endswith('"')) or
+ (string.startswith("'") and string.endswith("'"))):
return string[1:-1]
else:
return string
-ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
-def handleAttributes(text, parent):
- """Set values of an element based on attribute definitions ({@id=123})."""
- def attributeCallback(match):
- parent.set(match.group(1), match.group(2).replace('\n', ' '))
- return ATTR_RE.sub(attributeCallback, text)
+class EmStrongItem(namedtuple('EmStrongItem', ['pattern', 'builder', 'tags'])):
+ """Emphasis/strong pattern item."""
"""
@@ -113,10 +190,13 @@ The pattern classes
-----------------------------------------------------------------------------
"""
-class Pattern:
+
+class Pattern: # pragma: no cover
"""Base class that inline patterns subclass. """
- def __init__ (self, pattern, markdown_instance=None):
+ ANCESTOR_EXCLUDES = tuple()
+
+ def __init__(self, pattern, md=None):
"""
Create an instant of an inline pattern.
@@ -126,14 +206,12 @@ class Pattern:
"""
self.pattern = pattern
- self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, re.DOTALL)
+ self.compiled_re = re.compile(r"^(.*?)%s(.*)$" % pattern,
+ re.DOTALL | re.UNICODE)
- # Api for Markdown to pass safe_mode into instance
- self.safe_mode = False
- if markdown_instance:
- self.markdown = markdown_instance
+ self.md = md
- def getCompiledRegExp (self):
+ def getCompiledRegExp(self):
""" Return a compiled regular expression. """
return self.compiled_re
@@ -147,57 +225,163 @@ class Pattern:
* m: A re match object containing a match of the pattern.
"""
- pass
+ pass # pragma: no cover
def type(self):
""" Return class name, to define pattern type """
return self.__class__.__name__
-BasePattern = Pattern # for backward compatibility
+ def unescape(self, text):
+ """ Return unescaped text given text with an inline placeholder. """
+ try:
+ stash = self.md.treeprocessors['inline'].stashed_nodes
+ except KeyError: # pragma: no cover
+ return text
+
+ def get_stash(m):
+ id = m.group(1)
+ if id in stash:
+ value = stash.get(id)
+ if isinstance(value, str):
+ return value
+ else:
+ # An etree Element - return text content only
+ return ''.join(value.itertext())
+ return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
+
+
+class InlineProcessor(Pattern):
+ """
+ Base class that inline patterns subclass.
+
+ This is the newer style inline processor that uses a more
+ efficient and flexible search approach.
+ """
+
+ def __init__(self, pattern, md=None):
+ """
+ Create an instant of an inline pattern.
+
+ Keyword arguments:
+
+ * pattern: A regular expression that matches a pattern
+
+ """
+ self.pattern = pattern
+ self.compiled_re = re.compile(pattern, re.DOTALL | re.UNICODE)
+
+ # Api for Markdown to pass safe_mode into instance
+ self.safe_mode = False
+ self.md = md
+
+ def handleMatch(self, m, data):
+ """Return a ElementTree element from the given match and the
+ start and end index of the matched text.
+
+ If `start` and/or `end` are returned as `None`, it will be
+ assumed that the processor did not find a valid region of text.
+
+ Subclasses should override this method.
+
+ Keyword arguments:
+
+ * m: A re match object containing a match of the pattern.
+ * data: The buffer current under analysis
-class SimpleTextPattern (Pattern):
+ Returns:
+
+ * el: The ElementTree element, text or None.
+ * start: The start of the region that has been matched or None.
+ * end: The end of the region that has been matched or None.
+
+ """
+ pass # pragma: no cover
+
+
+class SimpleTextPattern(Pattern): # pragma: no cover
""" Return a simple text of group(2) of a Pattern. """
def handleMatch(self, m):
- text = m.group(2)
- if text == markdown.INLINE_PLACEHOLDER_PREFIX:
- return None
- return text
+ return m.group(2)
+
+
+class SimpleTextInlineProcessor(InlineProcessor):
+ """ Return a simple text of group(1) of a Pattern. """
+ def handleMatch(self, m, data):
+ return m.group(1), m.start(0), m.end(0)
+
+
+class EscapeInlineProcessor(InlineProcessor):
+ """ Return an escaped character. """
-class SimpleTagPattern (Pattern):
+ def handleMatch(self, m, data):
+ char = m.group(1)
+ if char in self.md.ESCAPED_CHARS:
+ return '{}{}{}'.format(util.STX, ord(char), util.ETX), m.start(0), m.end(0)
+ else:
+ return None, m.start(0), m.end(0)
+
+
+class SimpleTagPattern(Pattern): # pragma: no cover
"""
Return element of type `tag` with a text attribute of group(3)
of a Pattern.
"""
- def __init__ (self, pattern, tag):
+ def __init__(self, pattern, tag):
Pattern.__init__(self, pattern)
self.tag = tag
def handleMatch(self, m):
- el = markdown.etree.Element(self.tag)
+ el = etree.Element(self.tag)
el.text = m.group(3)
return el
-class SubstituteTagPattern (SimpleTagPattern):
- """ Return a eLement of type `tag` with no children. """
- def handleMatch (self, m):
- return markdown.etree.Element(self.tag)
+class SimpleTagInlineProcessor(InlineProcessor):
+ """
+ Return element of type `tag` with a text attribute of group(2)
+ of a Pattern.
+ """
+ def __init__(self, pattern, tag):
+ InlineProcessor.__init__(self, pattern)
+ self.tag = tag
-class BacktickPattern (Pattern):
- """ Return a `<code>` element containing the matching text. """
- def __init__ (self, pattern):
- Pattern.__init__(self, pattern)
- self.tag = "code"
+ def handleMatch(self, m, data): # pragma: no cover
+ el = etree.Element(self.tag)
+ el.text = m.group(2)
+ return el, m.start(0), m.end(0)
+
+class SubstituteTagPattern(SimpleTagPattern): # pragma: no cover
+ """ Return an element of type `tag` with no children. """
def handleMatch(self, m):
- el = markdown.etree.Element(self.tag)
- el.text = markdown.AtomicString(m.group(3).strip())
- return el
+ return etree.Element(self.tag)
+
+
+class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
+ """ Return an element of type `tag` with no children. """
+ def handleMatch(self, m, data):
+ return etree.Element(self.tag), m.start(0), m.end(0)
+
+
+class BacktickInlineProcessor(InlineProcessor):
+ """ Return a `<code>` element containing the matching text. """
+ def __init__(self, pattern):
+ InlineProcessor.__init__(self, pattern)
+ self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX)
+ self.tag = 'code'
+
+ def handleMatch(self, m, data):
+ if m.group(3):
+ el = etree.Element(self.tag)
+ el.text = util.AtomicString(util.code_escape(m.group(3).strip()))
+ return el, m.start(0), m.end(0)
+ else:
+ return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), m.start(0), m.end(0)
-class DoubleTagPattern (SimpleTagPattern):
+class DoubleTagPattern(SimpleTagPattern): # pragma: no cover
"""Return a ElementTree element nested in tag2 nested in tag1.
Useful for strong emphasis etc.
@@ -205,117 +389,432 @@ class DoubleTagPattern (SimpleTagPattern):
"""
def handleMatch(self, m):
tag1, tag2 = self.tag.split(",")
- el1 = markdown.etree.Element(tag1)
- el2 = markdown.etree.SubElement(el1, tag2)
+ el1 = etree.Element(tag1)
+ el2 = etree.SubElement(el1, tag2)
el2.text = m.group(3)
+ if len(m.groups()) == 5:
+ el2.tail = m.group(4)
return el1
-class HtmlPattern (Pattern):
+class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
+ """Return a ElementTree element nested in tag2 nested in tag1.
+
+ Useful for strong emphasis etc.
+
+ """
+ def handleMatch(self, m, data): # pragma: no cover
+ tag1, tag2 = self.tag.split(",")
+ el1 = etree.Element(tag1)
+ el2 = etree.SubElement(el1, tag2)
+ el2.text = m.group(2)
+ if len(m.groups()) == 3:
+ el2.tail = m.group(3)
+ return el1, m.start(0), m.end(0)
+
+
+class HtmlInlineProcessor(InlineProcessor):
""" Store raw inline html and return a placeholder. """
- def handleMatch (self, m):
- rawhtml = m.group(2)
- inline = True
- place_holder = self.markdown.htmlStash.store(rawhtml)
- return place_holder
+ def handleMatch(self, m, data):
+ rawhtml = self.unescape(m.group(1))
+ place_holder = self.md.htmlStash.store(rawhtml)
+ return place_holder, m.start(0), m.end(0)
+
+ def unescape(self, text):
+ """ Return unescaped text given text with an inline placeholder. """
+ try:
+ stash = self.md.treeprocessors['inline'].stashed_nodes
+ except KeyError: # pragma: no cover
+ return text
+
+ def get_stash(m):
+ id = m.group(1)
+ value = stash.get(id)
+ if value is not None:
+ try:
+ return self.md.serializer(value)
+ except Exception:
+ return r'\%s' % value
+
+ return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
+
+
+class AsteriskProcessor(InlineProcessor):
+ """Emphasis processor for handling strong and em matches inside asterisks."""
+
+ PATTERNS = [
+ EmStrongItem(re.compile(EM_STRONG_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
+ EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
+ EmStrongItem(re.compile(STRONG_EM3_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
+ EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
+ EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
+ ]
+
+ def build_single(self, m, tag, idx):
+ """Return single tag."""
+ el1 = etree.Element(tag)
+ text = m.group(2)
+ self.parse_sub_patterns(text, el1, None, idx)
+ return el1
+ def build_double(self, m, tags, idx):
+ """Return double tag."""
-class LinkPattern (Pattern):
- """ Return a link element from the given match. """
- def handleMatch(self, m):
- el = markdown.etree.Element("a")
- el.text = m.group(2)
- title = m.group(11)
- href = m.group(9)
+ tag1, tag2 = tags.split(",")
+ el1 = etree.Element(tag1)
+ el2 = etree.Element(tag2)
+ text = m.group(2)
+ self.parse_sub_patterns(text, el2, None, idx)
+ el1.append(el2)
+ if len(m.groups()) == 3:
+ text = m.group(3)
+ self.parse_sub_patterns(text, el1, el2, idx)
+ return el1
- if href:
- if href[0] == "<":
- href = href[1:-1]
- el.set("href", self.sanitize_url(href.strip()))
- else:
- el.set("href", "")
+ def build_double2(self, m, tags, idx):
+ """Return double tags (variant 2): `<strong>text <em>text</em></strong>`."""
- if title:
- title = dequote(title) #.replace('"', "&quot;")
- el.set("title", title)
- return el
+ tag1, tag2 = tags.split(",")
+ el1 = etree.Element(tag1)
+ el2 = etree.Element(tag2)
+ text = m.group(2)
+ self.parse_sub_patterns(text, el1, None, idx)
+ text = m.group(3)
+ el1.append(el2)
+ self.parse_sub_patterns(text, el2, None, idx)
+ return el1
- def sanitize_url(self, url):
+ def parse_sub_patterns(self, data, parent, last, idx):
"""
- Sanitize a url against xss attacks in "safe_mode".
-
- Rather than specifically blacklisting `javascript:alert("XSS")` and all
- its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
- safe url formats. Most urls contain a network location, however some
- are known not to (i.e.: mailto links). Script urls do not contain a
- location. Additionally, for `javascript:...`, the scheme would be
- "javascript" but some aliases will appear to `urlparse()` to have no
- scheme. On top of that relative links (i.e.: "foo/bar.html") have no
- scheme. Therefore we must check "path", "parameters", "query" and
- "fragment" for any literal colons. We don't check "scheme" for colons
- because it *should* never have any and "netloc" must allow the form:
- `username:password@host:port`.
+ Parses sub patterns.
+
+ `data` (`str`):
+ text to evaluate.
+
+ `parent` (`etree.Element`):
+ Parent to attach text and sub elements to.
+
+ `last` (`etree.Element`):
+ Last appended child to parent. Can also be None if parent has no children.
+
+ `idx` (`int`):
+ Current pattern index that was used to evaluate the parent.
"""
- locless_schemes = ['', 'mailto', 'news']
- scheme, netloc, path, params, query, fragment = url = urlparse(url)
- safe_url = False
- if netloc != '' or scheme in locless_schemes:
- safe_url = True
-
- for part in url[2:]:
- if ":" in part:
- safe_url = False
-
- if self.markdown.safeMode and not safe_url:
- return ''
+
+ offset = 0
+ pos = 0
+
+ length = len(data)
+ while pos < length:
+ # Find the start of potential emphasis or strong tokens
+ if self.compiled_re.match(data, pos):
+ matched = False
+ # See if the we can match an emphasis/strong pattern
+ for index, item in enumerate(self.PATTERNS):
+ # Only evaluate patterns that are after what was used on the parent
+ if index <= idx:
+ continue
+ m = item.pattern.match(data, pos)
+ if m:
+ # Append child nodes to parent
+ # Text nodes should be appended to the last
+ # child if present, and if not, it should
+ # be added as the parent's text node.
+ text = data[offset:m.start(0)]
+ if text:
+ if last is not None:
+ last.tail = text
+ else:
+ parent.text = text
+ el = self.build_element(m, item.builder, item.tags, index)
+ parent.append(el)
+ last = el
+ # Move our position past the matched hunk
+ offset = pos = m.end(0)
+ matched = True
+ if not matched:
+ # We matched nothing, move on to the next character
+ pos += 1
+ else:
+ # Increment position as no potential emphasis start was found.
+ pos += 1
+
+ # Append any leftover text as a text node.
+ text = data[offset:]
+ if text:
+ if last is not None:
+ last.tail = text
+ else:
+ parent.text = text
+
+ def build_element(self, m, builder, tags, index):
+ """Element builder."""
+
+ if builder == 'double2':
+ return self.build_double2(m, tags, index)
+ elif builder == 'double':
+ return self.build_double(m, tags, index)
else:
- return urlunparse(url)
+ return self.build_single(m, tags, index)
+
+ def handleMatch(self, m, data):
+ """Parse patterns."""
+
+ el = None
+ start = None
+ end = None
+
+ for index, item in enumerate(self.PATTERNS):
+ m1 = item.pattern.match(data, m.start(0))
+ if m1:
+ start = m1.start(0)
+ end = m1.end(0)
+ el = self.build_element(m1, item.builder, item.tags, index)
+ break
+ return el, start, end
-class ImagePattern(LinkPattern):
+
+class UnderscoreProcessor(AsteriskProcessor):
+ """Emphasis processor for handling strong and em matches inside underscores."""
+
+ PATTERNS = [
+ EmStrongItem(re.compile(EM_STRONG2_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
+ EmStrongItem(re.compile(STRONG_EM2_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
+ EmStrongItem(re.compile(SMART_STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
+ EmStrongItem(re.compile(SMART_STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
+ EmStrongItem(re.compile(SMART_EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
+ ]
+
+
+class LinkInlineProcessor(InlineProcessor):
+ """ Return a link element from the given match. """
+ RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
+ RE_TITLE_CLEAN = re.compile(r'\s')
+
+ def handleMatch(self, m, data):
+ text, index, handled = self.getText(data, m.end(0))
+
+ if not handled:
+ return None, None, None
+
+ href, title, index, handled = self.getLink(data, index)
+ if not handled:
+ return None, None, None
+
+ el = etree.Element("a")
+ el.text = text
+
+ el.set("href", href)
+
+ if title is not None:
+ el.set("title", title)
+
+ return el, m.start(0), index
+
+ def getLink(self, data, index):
+ """Parse data between `()` of `[Text]()` allowing recursive `()`. """
+
+ href = ''
+ title = None
+ handled = False
+
+ m = self.RE_LINK.match(data, pos=index)
+ if m and m.group(1):
+ # Matches [Text](<link> "title")
+ href = m.group(1)[1:-1].strip()
+ if m.group(2):
+ title = m.group(2)[1:-1]
+ index = m.end(0)
+ handled = True
+ elif m:
+ # Track bracket nesting and index in string
+ bracket_count = 1
+ backtrack_count = 1
+ start_index = m.end()
+ index = start_index
+ last_bracket = -1
+
+ # Primary (first found) quote tracking.
+ quote = None
+ start_quote = -1
+ exit_quote = -1
+ ignore_matches = False
+
+ # Secondary (second found) quote tracking.
+ alt_quote = None
+ start_alt_quote = -1
+ exit_alt_quote = -1
+
+ # Track last character
+ last = ''
+
+ for pos in range(index, len(data)):
+ c = data[pos]
+ if c == '(':
+ # Count nested (
+ # Don't increment the bracket count if we are sure we're in a title.
+ if not ignore_matches:
+ bracket_count += 1
+ elif backtrack_count > 0:
+ backtrack_count -= 1
+ elif c == ')':
+ # Match nested ) to (
+ # Don't decrement if we are sure we are in a title that is unclosed.
+ if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
+ bracket_count = 0
+ elif not ignore_matches:
+ bracket_count -= 1
+ elif backtrack_count > 0:
+ backtrack_count -= 1
+ # We've found our backup end location if the title doesn't resolve.
+ if backtrack_count == 0:
+ last_bracket = index + 1
+
+ elif c in ("'", '"'):
+ # Quote has started
+ if not quote:
+ # We'll assume we are now in a title.
+ # Brackets are quoted, so no need to match them (except for the final one).
+ ignore_matches = True
+ backtrack_count = bracket_count
+ bracket_count = 1
+ start_quote = index + 1
+ quote = c
+ # Secondary quote (in case the first doesn't resolve): [text](link'"title")
+ elif c != quote and not alt_quote:
+ start_alt_quote = index + 1
+ alt_quote = c
+ # Update primary quote match
+ elif c == quote:
+ exit_quote = index + 1
+ # Update secondary quote match
+ elif alt_quote and c == alt_quote:
+ exit_alt_quote = index + 1
+
+ index += 1
+
+ # Link is closed, so let's break out of the loop
+ if bracket_count == 0:
+ # Get the title if we closed a title string right before link closed
+ if exit_quote >= 0 and quote == last:
+ href = data[start_index:start_quote - 1]
+ title = ''.join(data[start_quote:exit_quote - 1])
+ elif exit_alt_quote >= 0 and alt_quote == last:
+ href = data[start_index:start_alt_quote - 1]
+ title = ''.join(data[start_alt_quote:exit_alt_quote - 1])
+ else:
+ href = data[start_index:index - 1]
+ break
+
+ if c != ' ':
+ last = c
+
+ # We have a scenario: [test](link"notitle)
+ # When we enter a string, we stop tracking bracket resolution in the main counter,
+ # but we do keep a backup counter up until we discover where we might resolve all brackets
+ # if the title string fails to resolve.
+ if bracket_count != 0 and backtrack_count == 0:
+ href = data[start_index:last_bracket - 1]
+ index = last_bracket
+ bracket_count = 0
+
+ handled = bracket_count == 0
+
+ if title is not None:
+ title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))
+
+ href = self.unescape(href).strip()
+
+ return href, title, index, handled
+
+ def getText(self, data, index):
+ """Parse the content between `[]` of the start of an image or link
+ resolving nested square brackets.
+
+ """
+ bracket_count = 1
+ text = []
+ for pos in range(index, len(data)):
+ c = data[pos]
+ if c == ']':
+ bracket_count -= 1
+ elif c == '[':
+ bracket_count += 1
+ index += 1
+ if bracket_count == 0:
+ break
+ text.append(c)
+ return ''.join(text), index, bracket_count == 0
+
+
+class ImageInlineProcessor(LinkInlineProcessor):
""" Return a img element from the given match. """
- def handleMatch(self, m):
- el = markdown.etree.Element("img")
- src_parts = m.group(9).split()
- if src_parts:
- src = src_parts[0]
- if src[0] == "<" and src[-1] == ">":
- src = src[1:-1]
- el.set('src', self.sanitize_url(src))
- else:
- el.set('src', "")
- if len(src_parts) > 1:
- el.set('title', dequote(" ".join(src_parts[1:])))
- if markdown.ENABLE_ATTRIBUTES:
- truealt = handleAttributes(m.group(2), el)
- else:
- truealt = m.group(2)
+ def handleMatch(self, m, data):
+ text, index, handled = self.getText(data, m.end(0))
+ if not handled:
+ return None, None, None
- el.set('alt', truealt)
- return el
+ src, title, index, handled = self.getLink(data, index)
+ if not handled:
+ return None, None, None
-class ReferencePattern(LinkPattern):
+ el = etree.Element("img")
+
+ el.set("src", src)
+
+ if title is not None:
+ el.set("title", title)
+
+ el.set('alt', self.unescape(text))
+ return el, m.start(0), index
+
+
+class ReferenceInlineProcessor(LinkInlineProcessor):
""" Match to a stored reference and return link element. """
- def handleMatch(self, m):
- if m.group(9):
- id = m.group(9).lower()
- else:
- # if we got something like "[Google][]"
- # we'll use "google" as the id
- id = m.group(2).lower()
+ NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)
- if not id in self.markdown.references: # ignore undefined refs
- return None
- href, title = self.markdown.references[id]
+ RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)
- text = m.group(2)
- return self.makeTag(href, title, text)
+ def handleMatch(self, m, data):
+ text, index, handled = self.getText(data, m.end(0))
+ if not handled:
+ return None, None, None
+
+ id, end, handled = self.evalId(data, index, text)
+ if not handled:
+ return None, None, None
+
+ # Clean up linebreaks in id
+ id = self.NEWLINE_CLEANUP_RE.sub(' ', id)
+ if id not in self.md.references: # ignore undefined refs
+ return None, m.start(0), end
+
+ href, title = self.md.references[id]
+
+ return self.makeTag(href, title, text), m.start(0), end
+
+ def evalId(self, data, index, text):
+ """
+ Evaluate the id portion of [ref][id].
+
+ If [ref][] use [ref].
+ """
+ m = self.RE_LINK.match(data, pos=index)
+ if not m:
+ return None, index, False
+ else:
+ id = m.group(1).lower()
+ end = m.end(0)
+ if not id:
+ id = text.lower()
+ return id, end, True
def makeTag(self, href, title, text):
- el = markdown.etree.Element('a')
+ el = etree.Element('a')
- el.set('href', self.sanitize_url(href))
+ el.set('href', href)
if title:
el.set('title', title)
@@ -323,49 +822,65 @@ class ReferencePattern(LinkPattern):
return el
-class ImageReferencePattern (ReferencePattern):
+class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
+ """Short form of reference: [google]. """
+ def evalId(self, data, index, text):
+ """Evaluate the id from of [ref] """
+
+ return text.lower(), index, True
+
+
+class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
""" Match to a stored reference and return img element. """
def makeTag(self, href, title, text):
- el = markdown.etree.Element("img")
- el.set("src", self.sanitize_url(href))
+ el = etree.Element("img")
+ el.set("src", href)
if title:
el.set("title", title)
- el.set("alt", text)
+ el.set("alt", self.unescape(text))
return el
-class AutolinkPattern (Pattern):
+class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
+ """ Short form of inage reference: ![ref]. """
+ def evalId(self, data, index, text):
+ """Evaluate the id from of [ref] """
+
+ return text.lower(), index, True
+
+
+class AutolinkInlineProcessor(InlineProcessor):
""" Return a link Element given an autolink (`<http://example/com>`). """
- def handleMatch(self, m):
- el = markdown.etree.Element("a")
- el.set('href', m.group(2))
- el.text = markdown.AtomicString(m.group(2))
- return el
+ def handleMatch(self, m, data):
+ el = etree.Element("a")
+ el.set('href', self.unescape(m.group(1)))
+ el.text = util.AtomicString(m.group(1))
+ return el, m.start(0), m.end(0)
+
-class AutomailPattern (Pattern):
+class AutomailInlineProcessor(InlineProcessor):
"""
Return a mailto link Element given an automail link (`<foo@example.com>`).
"""
- def handleMatch(self, m):
- el = markdown.etree.Element('a')
- email = m.group(2)
+ def handleMatch(self, m, data):
+ el = etree.Element('a')
+ email = self.unescape(m.group(1))
if email.startswith("mailto:"):
email = email[len("mailto:"):]
def codepoint2name(code):
"""Return entity definition by code, or the code if not defined."""
- entity = htmlentitydefs.codepoint2name.get(code)
+ entity = entities.codepoint2name.get(code)
if entity:
- return "%s%s;" % (markdown.AMP_SUBSTITUTE, entity)
+ return "{}{};".format(util.AMP_SUBSTITUTE, entity)
else:
- return "%s#%d;" % (markdown.AMP_SUBSTITUTE, code)
+ return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
letters = [codepoint2name(ord(letter)) for letter in email]
- el.text = markdown.AtomicString(''.join(letters))
+ el.text = util.AtomicString(''.join(letters))
mailto = "mailto:" + email
- mailto = "".join([markdown.AMP_SUBSTITUTE + '#%d;' %
+ mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %
ord(letter) for letter in mailto])
el.set('href', mailto)
- return el
-
+ return el, m.start(0), m.end(0)
diff --git a/markdown/odict.py b/markdown/odict.py
deleted file mode 100644
index bf3ef07..0000000
--- a/markdown/odict.py
+++ /dev/null
@@ -1,162 +0,0 @@
-class OrderedDict(dict):
- """
- A dictionary that keeps its keys in the order in which they're inserted.
-
- Copied from Django's SortedDict with some modifications.
-
- """
- def __new__(cls, *args, **kwargs):
- instance = super(OrderedDict, cls).__new__(cls, *args, **kwargs)
- instance.keyOrder = []
- return instance
-
- def __init__(self, data=None):
- if data is None:
- data = {}
- super(OrderedDict, self).__init__(data)
- if isinstance(data, dict):
- self.keyOrder = data.keys()
- else:
- self.keyOrder = []
- for key, value in data:
- if key not in self.keyOrder:
- self.keyOrder.append(key)
-
- def __deepcopy__(self, memo):
- from copy import deepcopy
- return self.__class__([(key, deepcopy(value, memo))
- for key, value in self.iteritems()])
-
- def __setitem__(self, key, value):
- super(OrderedDict, self).__setitem__(key, value)
- if key not in self.keyOrder:
- self.keyOrder.append(key)
-
- def __delitem__(self, key):
- super(OrderedDict, self).__delitem__(key)
- self.keyOrder.remove(key)
-
- def __iter__(self):
- for k in self.keyOrder:
- yield k
-
- def pop(self, k, *args):
- result = super(OrderedDict, self).pop(k, *args)
- try:
- self.keyOrder.remove(k)
- except ValueError:
- # Key wasn't in the dictionary in the first place. No problem.
- pass
- return result
-
- def popitem(self):
- result = super(OrderedDict, self).popitem()
- self.keyOrder.remove(result[0])
- return result
-
- def items(self):
- return zip(self.keyOrder, self.values())
-
- def iteritems(self):
- for key in self.keyOrder:
- yield key, super(OrderedDict, self).__getitem__(key)
-
- def keys(self):
- return self.keyOrder[:]
-
- def iterkeys(self):
- return iter(self.keyOrder)
-
- def values(self):
- return [super(OrderedDict, self).__getitem__(k) for k in self.keyOrder]
-
- def itervalues(self):
- for key in self.keyOrder:
- yield super(OrderedDict, self).__getitem__(key)
-
- def update(self, dict_):
- for k, v in dict_.items():
- self.__setitem__(k, v)
-
- def setdefault(self, key, default):
- if key not in self.keyOrder:
- self.keyOrder.append(key)
- return super(OrderedDict, self).setdefault(key, default)
-
- def value_for_index(self, index):
- """Return the value of the item at the given zero-based index."""
- return self[self.keyOrder[index]]
-
- def insert(self, index, key, value):
- """Insert the key, value pair before the item with the given index."""
- if key in self.keyOrder:
- n = self.keyOrder.index(key)
- del self.keyOrder[n]
- if n < index:
- index -= 1
- self.keyOrder.insert(index, key)
- super(OrderedDict, self).__setitem__(key, value)
-
- def copy(self):
- """Return a copy of this object."""
- # This way of initializing the copy means it works for subclasses, too.
- obj = self.__class__(self)
- obj.keyOrder = self.keyOrder[:]
- return obj
-
- def __repr__(self):
- """
- Replace the normal dict.__repr__ with a version that returns the keys
- in their sorted order.
- """
- return '{%s}' % ', '.join(['%r: %r' % (k, v) for k, v in self.items()])
-
- def clear(self):
- super(OrderedDict, self).clear()
- self.keyOrder = []
-
- def index(self, key):
- """ Return the index of a given key. """
- return self.keyOrder.index(key)
-
- def index_for_location(self, location):
- """ Return index or None for a given location. """
- if location == '_begin':
- i = 0
- elif location == '_end':
- i = None
- elif location.startswith('<') or location.startswith('>'):
- i = self.index(location[1:])
- if location.startswith('>'):
- if i >= len(self):
- # last item
- i = None
- else:
- i += 1
- else:
- raise ValueError('Not a valid location: "%s". Location key '
- 'must start with a ">" or "<".' % location)
- return i
-
- def add(self, key, value, location):
- """ Insert by key location. """
- i = self.index_for_location(location)
- if i is not None:
- self.insert(i, key, value)
- else:
- self.__setitem__(key, value)
-
- def link(self, key, location):
- """ Change location of an existing item. """
- n = self.keyOrder.index(key)
- del self.keyOrder[n]
- i = self.index_for_location(location)
- try:
- if i is not None:
- self.keyOrder.insert(i, key)
- else:
- self.keyOrder.append(key)
- except Error:
- # restore to prevent data loss and reraise
- self.keyOrder.insert(n, key)
- raise Error
diff --git a/markdown/postprocessors.py b/markdown/postprocessors.py
index 80227bb..498f7e8 100644
--- a/markdown/postprocessors.py
+++ b/markdown/postprocessors.py
@@ -1,4 +1,23 @@
"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+
POST-PROCESSORS
=============================================================================
@@ -8,15 +27,20 @@ processing.
"""
+from collections import OrderedDict
+from . import util
+import re
+
-import markdown
+def build_postprocessors(md, **kwargs):
+ """ Build the default postprocessors for Markdown. """
+ postprocessors = util.Registry()
+ postprocessors.register(RawHtmlPostprocessor(md), 'raw_html', 30)
+ postprocessors.register(AndSubstitutePostprocessor(), 'amp_substitute', 20)
+ return postprocessors
-class Processor:
- def __init__(self, markdown_instance=None):
- if markdown_instance:
- self.markdown = markdown_instance
-class Postprocessor(Processor):
+class Postprocessor(util.Processor):
"""
Postprocessors are run after the ElementTree it converted back into text.
@@ -34,44 +58,80 @@ class Postprocessor(Processor):
(possibly modified) string.
"""
- pass
+ pass # pragma: no cover
class RawHtmlPostprocessor(Postprocessor):
""" Restore raw html to the document. """
+ BLOCK_LEVEL_REGEX = re.compile(r'^\<\/?([^ >]+)')
+
def run(self, text):
- """ Iterate over html stash and restore "safe" html. """
- for i in range(self.markdown.htmlStash.html_counter):
- html, safe = self.markdown.htmlStash.rawHtmlBlocks[i]
- if self.markdown.safeMode and not safe:
- if str(self.markdown.safeMode).lower() == 'escape':
- html = self.escape(html)
- elif str(self.markdown.safeMode).lower() == 'remove':
- html = ''
+ """ Iterate over html stash and restore html. """
+ replacements = OrderedDict()
+ for i in range(self.md.htmlStash.html_counter):
+ html = self.stash_to_string(self.md.htmlStash.rawHtmlBlocks[i])
+ if self.isblocklevel(html):
+ replacements["<p>{}</p>".format(
+ self.md.htmlStash.get_placeholder(i))] = html
+ replacements[self.md.htmlStash.get_placeholder(i)] = html
+
+ def substitute_match(m):
+ key = m.group(0)
+
+ if key not in replacements:
+ if key[3:-4] in replacements:
+ return f'<p>{ replacements[key[3:-4]] }</p>'
else:
- html = markdown.HTML_REMOVED_TEXT
- if safe or not self.markdown.safeMode:
- text = text.replace("<p>%s</p>" %
- (markdown.preprocessors.HTML_PLACEHOLDER % i),
- html + "\n")
- text = text.replace(markdown.preprocessors.HTML_PLACEHOLDER % i,
- html)
- return text
+ return key
+
+ return replacements[key]
- def escape(self, html):
- """ Basic html escaping """
- html = html.replace('&', '&amp;')
- html = html.replace('<', '&lt;')
- html = html.replace('>', '&gt;')
- return html.replace('"', '&quot;')
+ if replacements:
+ base_placeholder = util.HTML_PLACEHOLDER % r'([0-9]+)'
+ pattern = re.compile(f'<p>{ base_placeholder }</p>|{ base_placeholder }')
+ processed_text = pattern.sub(substitute_match, text)
+ else:
+ return text
+
+ if processed_text == text:
+ return processed_text
+ else:
+ return self.run(processed_text)
+
+ def isblocklevel(self, html):
+ m = self.BLOCK_LEVEL_REGEX.match(html)
+ if m:
+ if m.group(1)[0] in ('!', '?', '@', '%'):
+ # Comment, php etc...
+ return True
+ return self.md.is_block_level(m.group(1))
+ return False
+
+ def stash_to_string(self, text):
+ """ Convert a stashed object to a string. """
+ return str(text)
class AndSubstitutePostprocessor(Postprocessor):
""" Restore valid entities """
- def __init__(self):
- pass
def run(self, text):
- text = text.replace(markdown.AMP_SUBSTITUTE, "&")
+ text = text.replace(util.AMP_SUBSTITUTE, "&")
return text
+
+
+@util.deprecated(
+ "This class will be removed in the future; "
+ "use 'treeprocessors.UnescapeTreeprocessor' instead."
+)
+class UnescapePostprocessor(Postprocessor):
+ """ Restore escaped chars """
+
+ RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))
+
+ def unescape(self, m):
+ return chr(int(m.group(1)))
+
+ def run(self, text):
+ return self.RE.sub(self.unescape, text)
diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py
index ef04cab..e1023c5 100644
--- a/markdown/preprocessors.py
+++ b/markdown/preprocessors.py
@@ -1,24 +1,44 @@
-
"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+
PRE-PROCESSORS
=============================================================================
Preprocessors work on source text before we start doing anything too
-complicated.
+complicated.
"""
+from . import util
+from .htmlparser import HTMLExtractor
import re
-import markdown
-HTML_PLACEHOLDER_PREFIX = markdown.STX+"wzxhzdk:"
-HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + markdown.ETX
-class Processor:
- def __init__(self, markdown_instance=None):
- if markdown_instance:
- self.markdown = markdown_instance
+def build_preprocessors(md, **kwargs):
+ """ Build the default set of preprocessors used by Markdown. """
+ preprocessors = util.Registry()
+ preprocessors.register(NormalizeWhitespace(md), 'normalize_whitespace', 30)
+ preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
+ return preprocessors
+
-class Preprocessor (Processor):
+class Preprocessor(util.Processor):
"""
Preprocessors are run after the text is broken into lines.
@@ -36,180 +56,27 @@ class Preprocessor (Processor):
the (possibly modified) list of lines.
"""
- pass
+ pass # pragma: no cover
-class HtmlStash:
- """
- This class is used for stashing HTML objects that we extract
- in the beginning and replace with place-holders.
- """
- def __init__ (self):
- """ Create a HtmlStash. """
- self.html_counter = 0 # for counting inline html segments
- self.rawHtmlBlocks=[]
+class NormalizeWhitespace(Preprocessor):
+ """ Normalize whitespace for consistent parsing. """
- def store(self, html, safe=False):
- """
- Saves an HTML segment for later reinsertion. Returns a
- placeholder string that needs to be inserted into the
- document.
-
- Keyword arguments:
-
- * html: an html segment
- * safe: label an html segment as safe for safemode
-
- Returns : a placeholder string
-
- """
- self.rawHtmlBlocks.append((html, safe))
- placeholder = HTML_PLACEHOLDER % self.html_counter
- self.html_counter += 1
- return placeholder
-
- def reset(self):
- self.html_counter = 0
- self.rawHtmlBlocks = []
+ def run(self, lines):
+ source = '\n'.join(lines)
+ source = source.replace(util.STX, "").replace(util.ETX, "")
+ source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
+ source = source.expandtabs(self.md.tab_length)
+ source = re.sub(r'(?<=\n) +\n', '\n', source)
+ return source.split('\n')
class HtmlBlockPreprocessor(Preprocessor):
"""Remove html blocks from the text and store them for later retrieval."""
- right_tag_patterns = ["</%s>", "%s>"]
-
- def _get_left_tag(self, block):
- return block[1:].replace(">", " ", 1).split()[0].lower()
-
- def _get_right_tag(self, left_tag, block):
- for p in self.right_tag_patterns:
- tag = p % left_tag
- i = block.rfind(tag)
- if i > 2:
- return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag)
- return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)
-
- def _equal_tags(self, left_tag, right_tag):
- if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
- return True
- if ("/" + left_tag) == right_tag:
- return True
- if (right_tag == "--" and left_tag == "--"):
- return True
- elif left_tag == right_tag[1:] \
- and right_tag[0] != "<":
- return True
- else:
- return False
-
- def _is_oneliner(self, tag):
- return (tag in ['hr', 'hr/'])
-
def run(self, lines):
- text = "\n".join(lines)
- new_blocks = []
- text = text.split("\n\n")
- items = []
- left_tag = ''
- right_tag = ''
- in_tag = False # flag
-
- while text:
- block = text[0]
- if block.startswith("\n"):
- block = block[1:]
- text = text[1:]
-
- if block.startswith("\n"):
- block = block[1:]
-
- if not in_tag:
- if block.startswith("<"):
- left_tag = self._get_left_tag(block)
- right_tag, data_index = self._get_right_tag(left_tag, block)
-
- if block[1] == "!":
- # is a comment block
- left_tag = "--"
- right_tag, data_index = self._get_right_tag(left_tag, block)
- # keep checking conditions below and maybe just append
-
- if data_index < len(block) \
- and markdown.isBlockLevel(left_tag):
- text.insert(0, block[data_index:])
- block = block[:data_index]
-
- if not (markdown.isBlockLevel(left_tag) \
- or block[1] in ["!", "?", "@", "%"]):
- new_blocks.append(block)
- continue
-
- if self._is_oneliner(left_tag):
- new_blocks.append(block.strip())
- continue
-
- if block.rstrip().endswith(">") \
- and self._equal_tags(left_tag, right_tag):
- new_blocks.append(
- self.markdown.htmlStash.store(block.strip()))
- continue
- else: #if not block[1] == "!":
- # if is block level tag and is not complete
-
- if markdown.isBlockLevel(left_tag) or left_tag == "--" \
- and not block.rstrip().endswith(">"):
- items.append(block.strip())
- in_tag = True
- else:
- new_blocks.append(
- self.markdown.htmlStash.store(block.strip()))
-
- continue
-
- new_blocks.append(block)
-
- else:
- items.append(block.strip())
-
- right_tag, data_index = self._get_right_tag(left_tag, block)
-
- if self._equal_tags(left_tag, right_tag):
- # if find closing tag
- in_tag = False
- new_blocks.append(
- self.markdown.htmlStash.store('\n\n'.join(items)))
- items = []
-
- if items:
- new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
- new_blocks.append('\n')
-
- new_text = "\n\n".join(new_blocks)
- return new_text.split("\n")
-
-
-class ReferencePreprocessor(Preprocessor):
- """ Remove reference definitions from text and store for later use. """
-
- RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL)
-
- def run (self, lines):
- new_text = [];
- for line in lines:
- m = self.RE.match(line)
- if m:
- id = m.group(2).strip().lower()
- t = m.group(4).strip() # potential title
- if not t:
- self.markdown.references[id] = (m.group(3), t)
- elif (len(t) >= 2
- and (t[0] == t[-1] == "\""
- or t[0] == t[-1] == "\'"
- or (t[0] == "(" and t[-1] == ")") ) ):
- self.markdown.references[id] = (m.group(3), t[1:-1])
- else:
- new_text.append(line)
- else:
- new_text.append(line)
-
- return new_text #+ "\n"
+ source = '\n'.join(lines)
+ parser = HTMLExtractor(self.md)
+ parser.feed(source)
+ parser.close()
+ return ''.join(parser.cleandoc).split('\n')
diff --git a/markdown/serializers.py b/markdown/serializers.py
new file mode 100644
index 0000000..59bab18
--- /dev/null
+++ b/markdown/serializers.py
@@ -0,0 +1,189 @@
+# markdown/searializers.py
+#
+# Add x/html serialization to Elementree
+# Taken from ElementTree 1.3 preview with slight modifications
+#
+# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
+#
+# fredrik@pythonware.com
+# https://www.pythonware.com/
+#
+# --------------------------------------------------------------------
+# The ElementTree toolkit is
+#
+# Copyright (c) 1999-2007 by Fredrik Lundh
+#
+# By obtaining, using, and/or copying this software and/or its
+# associated documentation, you agree that you have read, understood,
+# and will comply with the following terms and conditions:
+#
+# Permission to use, copy, modify, and distribute this software and
+# its associated documentation for any purpose and without fee is
+# hereby granted, provided that the above copyright notice appears in
+# all copies, and that both that copyright notice and this permission
+# notice appear in supporting documentation, and that the name of
+# Secret Labs AB or the author not be used in advertising or publicity
+# pertaining to distribution of the software without specific, written
+# prior permission.
+#
+# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
+# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
+# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
+# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
+# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+# OF THIS SOFTWARE.
+# --------------------------------------------------------------------
+
+
+from xml.etree.ElementTree import ProcessingInstruction
+from xml.etree.ElementTree import Comment, ElementTree, QName
+import re
+
+__all__ = ['to_html_string', 'to_xhtml_string']
+
+HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
+ "img", "input", "isindex", "link", "meta", "param")
+RE_AMP = re.compile(r'&(?!(?:\#[0-9]+|\#x[0-9a-f]+|[0-9a-z]+);)', re.I)
+
+try:
+ HTML_EMPTY = set(HTML_EMPTY)
+except NameError: # pragma: no cover
+ pass
+
+
+def _raise_serialization_error(text): # pragma: no cover
+ raise TypeError(
+ "cannot serialize {!r} (type {})".format(text, type(text).__name__)
+ )
+
+
+def _escape_cdata(text):
+ # escape character data
+ try:
+ # it's worth avoiding do-nothing calls for strings that are
+ # shorter than 500 character, or so. assume that's, by far,
+ # the most common case in most applications.
+ if "&" in text:
+ # Only replace & when not part of an entity
+ text = RE_AMP.sub('&amp;', text)
+ if "<" in text:
+ text = text.replace("<", "&lt;")
+ if ">" in text:
+ text = text.replace(">", "&gt;")
+ return text
+ except (TypeError, AttributeError): # pragma: no cover
+ _raise_serialization_error(text)
+
+
+def _escape_attrib(text):
+ # escape attribute value
+ try:
+ if "&" in text:
+ # Only replace & when not part of an entity
+ text = RE_AMP.sub('&amp;', text)
+ if "<" in text:
+ text = text.replace("<", "&lt;")
+ if ">" in text:
+ text = text.replace(">", "&gt;")
+ if "\"" in text:
+ text = text.replace("\"", "&quot;")
+ if "\n" in text:
+ text = text.replace("\n", "&#10;")
+ return text
+ except (TypeError, AttributeError): # pragma: no cover
+ _raise_serialization_error(text)
+
+
+def _escape_attrib_html(text):
+ # escape attribute value
+ try:
+ if "&" in text:
+ # Only replace & when not part of an entity
+ text = RE_AMP.sub('&amp;', text)
+ if "<" in text:
+ text = text.replace("<", "&lt;")
+ if ">" in text:
+ text = text.replace(">", "&gt;")
+ if "\"" in text:
+ text = text.replace("\"", "&quot;")
+ return text
+ except (TypeError, AttributeError): # pragma: no cover
+ _raise_serialization_error(text)
+
+
+def _serialize_html(write, elem, format):
+ tag = elem.tag
+ text = elem.text
+ if tag is Comment:
+ write("<!--%s-->" % _escape_cdata(text))
+ elif tag is ProcessingInstruction:
+ write("<?%s?>" % _escape_cdata(text))
+ elif tag is None:
+ if text:
+ write(_escape_cdata(text))
+ for e in elem:
+ _serialize_html(write, e, format)
+ else:
+ namespace_uri = None
+ if isinstance(tag, QName):
+ # QNAME objects store their data as a string: `{uri}tag`
+ if tag.text[:1] == "{":
+ namespace_uri, tag = tag.text[1:].split("}", 1)
+ else:
+ raise ValueError('QName objects must define a tag.')
+ write("<" + tag)
+ items = elem.items()
+ if items:
+ items = sorted(items) # lexical order
+ for k, v in items:
+ if isinstance(k, QName):
+ # Assume a text only QName
+ k = k.text
+ if isinstance(v, QName):
+ # Assume a text only QName
+ v = v.text
+ else:
+ v = _escape_attrib_html(v)
+ if k == v and format == 'html':
+ # handle boolean attributes
+ write(" %s" % v)
+ else:
+ write(' {}="{}"'.format(k, v))
+ if namespace_uri:
+ write(' xmlns="%s"' % (_escape_attrib(namespace_uri)))
+ if format == "xhtml" and tag.lower() in HTML_EMPTY:
+ write(" />")
+ else:
+ write(">")
+ if text:
+ if tag.lower() in ["script", "style"]:
+ write(text)
+ else:
+ write(_escape_cdata(text))
+ for e in elem:
+ _serialize_html(write, e, format)
+ if tag.lower() not in HTML_EMPTY:
+ write("</" + tag + ">")
+ if elem.tail:
+ write(_escape_cdata(elem.tail))
+
+
+def _write_html(root, format="html"):
+ assert root is not None
+ data = []
+ write = data.append
+ _serialize_html(write, root, format)
+ return "".join(data)
+
+
+# --------------------------------------------------------------------
+# public functions
+
+def to_html_string(element):
+ return _write_html(ElementTree(element).getroot(), format="html")
+
+
+def to_xhtml_string(element):
+ return _write_html(ElementTree(element).getroot(), format="xhtml")
diff --git a/markdown/test_tools.py b/markdown/test_tools.py
new file mode 100644
index 0000000..2ce0e74
--- /dev/null
+++ b/markdown/test_tools.py
@@ -0,0 +1,220 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+import os
+import sys
+import unittest
+import textwrap
+from . import markdown, Markdown, util
+
+try:
+ import tidylib
+except ImportError:
+ tidylib = None
+
+__all__ = ['TestCase', 'LegacyTestCase', 'Kwargs']
+
+
+class TestCase(unittest.TestCase):
+ """
+ A unittest.TestCase subclass with helpers for testing Markdown output.
+
+ Define `default_kwargs` as a dict of keywords to pass to Markdown for each
+ test. The defaults can be overridden on individual tests.
+
+ The `assertMarkdownRenders` method accepts the source text, the expected
+ output, and any keywords to pass to Markdown. The `default_kwargs` are used
+ except where overridden by `kwargs`. The output and expected output are passed
+ to `TestCase.assertMultiLineEqual`. An AssertionError is raised with a diff
+ if the actual output does not equal the expected output.
+
+ The `dedent` method is available to dedent triple-quoted strings if
+ necessary.
+
+ In all other respects, behaves as unittest.TestCase.
+ """
+
+ default_kwargs = {}
+
+ def assertMarkdownRenders(self, source, expected, expected_attrs=None, **kwargs):
+ """
+ Test that source Markdown text renders to expected output with given keywords.
+
+ `expected_attrs` accepts a dict. Each key should be the name of an attribute
+ on the `Markdown` instance and the value should be the expected value after
+ the source text is parsed by Markdown. After the expected output is tested,
+ the expected value for each attribute is compared against the actual
+ attribute of the `Markdown` instance using `TestCase.assertEqual`.
+ """
+
+ expected_attrs = expected_attrs or {}
+ kws = self.default_kwargs.copy()
+ kws.update(kwargs)
+ md = Markdown(**kws)
+ output = md.convert(source)
+ self.assertMultiLineEqual(output, expected)
+ for key, value in expected_attrs.items():
+ self.assertEqual(getattr(md, key), value)
+
+ def dedent(self, text):
+ """
+ Dedent text.
+ """
+
+ # TODO: If/when actual output ends with a newline, then use:
+ # return textwrap.dedent(text.strip('/n'))
+ return textwrap.dedent(text).strip()
+
+
+class recursionlimit:
+ """
+ A context manager which temporarily modifies the Python recursion limit.
+
+ The testing framework, coverage, etc. may add an arbitrary number of levels to the depth. To maintain consistency
+ in the tests, the current stack depth is determined when called, then added to the provided limit.
+
+ Example usage:
+
+ with recursionlimit(20):
+ # test code here
+
+ See https://stackoverflow.com/a/50120316/866026
+ """
+
+ def __init__(self, limit):
+ self.limit = util._get_stack_depth() + limit
+ self.old_limit = sys.getrecursionlimit()
+
+ def __enter__(self):
+ sys.setrecursionlimit(self.limit)
+
+ def __exit__(self, type, value, tb):
+ sys.setrecursionlimit(self.old_limit)
+
+
+#########################
+# Legacy Test Framework #
+#########################
+
+
+class Kwargs(dict):
+ """ A dict like class for holding keyword arguments. """
+ pass
+
+
+def _normalize_whitespace(text):
+ """ Normalize whitespace for a string of html using tidylib. """
+ output, errors = tidylib.tidy_fragment(text, options={
+ 'drop_empty_paras': 0,
+ 'fix_backslash': 0,
+ 'fix_bad_comments': 0,
+ 'fix_uri': 0,
+ 'join_styles': 0,
+ 'lower_literals': 0,
+ 'merge_divs': 0,
+ 'output_xhtml': 1,
+ 'quote_ampersand': 0,
+ 'newline': 'LF'
+ })
+ return output
+
+
+class LegacyTestMeta(type):
+ def __new__(cls, name, bases, dct):
+
+ def generate_test(infile, outfile, normalize, kwargs):
+ def test(self):
+ with open(infile, encoding="utf-8") as f:
+ input = f.read()
+ with open(outfile, encoding="utf-8") as f:
+ # Normalize line endings
+ # (on Windows, git may have altered line endings).
+ expected = f.read().replace("\r\n", "\n")
+ output = markdown(input, **kwargs)
+ if tidylib and normalize:
+ try:
+ expected = _normalize_whitespace(expected)
+ output = _normalize_whitespace(output)
+ except OSError:
+ self.skipTest("Tidylib's c library not available.")
+ elif normalize:
+ self.skipTest('Tidylib not available.')
+ self.assertMultiLineEqual(output, expected)
+ return test
+
+ location = dct.get('location', '')
+ exclude = dct.get('exclude', [])
+ normalize = dct.get('normalize', False)
+ input_ext = dct.get('input_ext', '.txt')
+ output_ext = dct.get('output_ext', '.html')
+ kwargs = dct.get('default_kwargs', Kwargs())
+
+ if os.path.isdir(location):
+ for file in os.listdir(location):
+ infile = os.path.join(location, file)
+ if os.path.isfile(infile):
+ tname, ext = os.path.splitext(file)
+ if ext == input_ext:
+ outfile = os.path.join(location, tname + output_ext)
+ tname = tname.replace(' ', '_').replace('-', '_')
+ kws = kwargs.copy()
+ if tname in dct:
+ kws.update(dct[tname])
+ test_name = 'test_%s' % tname
+ if tname not in exclude:
+ dct[test_name] = generate_test(infile, outfile, normalize, kws)
+ else:
+ dct[test_name] = unittest.skip('Excluded')(lambda: None)
+
+ return type.__new__(cls, name, bases, dct)
+
+
+class LegacyTestCase(unittest.TestCase, metaclass=LegacyTestMeta):
+ """
+ A `unittest.TestCase` subclass for running Markdown's legacy file-based tests.
+
+ A subclass should define various properties which point to a directory of
+ text-based test files and define various behaviors/defaults for those tests.
+ The following properties are supported:
+
+ location: A path to the directory of test files. An absolute path is preferred.
+ exclude: A list of tests to exclude. Each test name should comprise the filename
+ without an extension.
+ normalize: A boolean value indicating if the HTML should be normalized.
+ Default: `False`.
+ input_ext: A string containing the file extension of input files. Default: `.txt`.
+ ouput_ext: A string containing the file extension of expected output files.
+ Default: `html`.
+ default_kwargs: A `Kwargs` instance which stores the default set of keyword
+ arguments for all test files in the directory.
+
+ In addition, properties can be defined for each individual set of test files within
+ the directory. The property should be given the name of the file without the file
+ extension. Any spaces and dashes in the filename should be replaced with
+ underscores. The value of the property should be a `Kwargs` instance which
+ contains the keyword arguments that should be passed to `Markdown` for that
+ test file. The keyword arguments will "update" the `default_kwargs`.
+
+ When the class instance is created, it will walk the given directory and create
+ a separate unitttest for each set of test files using the naming scheme:
+ `test_filename`. One unittest will be run for each set of input and output files.
+ """
+ pass
diff --git a/markdown/treeprocessors.py b/markdown/treeprocessors.py
index 1dc612a..e9f48ca 100644
--- a/markdown/treeprocessors.py
+++ b/markdown/treeprocessors.py
@@ -1,16 +1,47 @@
-import markdown
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
import re
+import xml.etree.ElementTree as etree
+from . import util
+from . import inlinepatterns
+
+
+def build_treeprocessors(md, **kwargs):
+ """ Build the default treeprocessors for Markdown. """
+ treeprocessors = util.Registry()
+ treeprocessors.register(InlineProcessor(md), 'inline', 20)
+ treeprocessors.register(PrettifyTreeprocessor(md), 'prettify', 10)
+ treeprocessors.register(UnescapeTreeprocessor(md), 'unescape', 0)
+ return treeprocessors
+
def isString(s):
""" Check if it's string """
- return isinstance(s, unicode) or isinstance(s, str)
+ if not isinstance(s, util.AtomicString):
+ return isinstance(s, str)
+ return False
-class Processor:
- def __init__(self, markdown_instance=None):
- if markdown_instance:
- self.markdown = markdown_instance
-class Treeprocessor(Processor):
+class Treeprocessor(util.Processor):
"""
Treeprocessors are run on the ElementTree object before serialization.
@@ -24,11 +55,11 @@ class Treeprocessor(Processor):
def run(self, root):
"""
Subclasses of Treeprocessor should implement a `run` method, which
- takes a root ElementTree. This method can return another ElementTree
- object, and the existing root ElementTree will be replaced, or it can
+ takes a root ElementTree. This method can return another ElementTree
+ object, and the existing root ElementTree will be replaced, or it can
modify the current tree and return None.
"""
- pass
+ pass # pragma: no cover
class InlineProcessor(Treeprocessor):
@@ -36,18 +67,20 @@ class InlineProcessor(Treeprocessor):
A Treeprocessor that traverses a tree, applying inline patterns.
"""
- def __init__ (self, md):
- self.__placeholder_prefix = markdown.INLINE_PLACEHOLDER_PREFIX
- self.__placeholder_suffix = markdown.ETX
+ def __init__(self, md):
+ self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
+ self.__placeholder_suffix = util.ETX
self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
+ len(self.__placeholder_suffix)
- self.__placeholder_re = re.compile(markdown.INLINE_PLACEHOLDER % r'([0-9]{4})')
- self.markdown = md
+ self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
+ self.md = md
+ self.inlinePatterns = md.inlinePatterns
+ self.ancestors = []
def __makePlaceholder(self, type):
""" Generate a placeholder """
id = "%04d" % len(self.stashed_nodes)
- hash = markdown.INLINE_PLACEHOLDER % id
+ hash = util.INLINE_PLACEHOLDER % id
return hash, id
def __findPlaceholder(self, data, index):
@@ -60,8 +93,8 @@ class InlineProcessor(Treeprocessor):
* index: index, from which we start search
Returns: placeholder id and string index, after the found placeholder.
- """
+ """
m = self.__placeholder_re.search(data, index)
if m:
return m.group(1), m.end()
@@ -87,12 +120,13 @@ class InlineProcessor(Treeprocessor):
Returns: String with placeholders.
"""
- if not isinstance(data, markdown.AtomicString):
+ if not isinstance(data, util.AtomicString):
startIndex = 0
- while patternIndex < len(self.markdown.inlinePatterns):
+ count = len(self.inlinePatterns)
+ while patternIndex < count:
data, matched, startIndex = self.__applyPattern(
- self.markdown.inlinePatterns.value_for_index(patternIndex),
- data, patternIndex, startIndex)
+ self.inlinePatterns[patternIndex], data, patternIndex, startIndex
+ )
if not matched:
patternIndex += 1
return data
@@ -118,19 +152,18 @@ class InlineProcessor(Treeprocessor):
text = subnode.tail
subnode.tail = None
- childResult = self.__processPlaceholders(text, subnode)
+ childResult = self.__processPlaceholders(text, subnode, isText)
if not isText and node is not subnode:
- pos = node.getchildren().index(subnode)
- node.remove(subnode)
+ pos = list(node).index(subnode) + 1
else:
pos = 0
childResult.reverse()
for newChild in childResult:
- node.insert(pos, newChild)
+ node.insert(pos, newChild[0])
- def __processPlaceholders(self, data, parent):
+ def __processPlaceholders(self, data, parent, isText=True):
"""
Process string with placeholders and generate ElementTree tree.
@@ -140,20 +173,25 @@ class InlineProcessor(Treeprocessor):
* parent: Element, which contains processing inline data
Returns: list with ElementTree elements with applied inline patterns.
+
"""
def linkText(text):
if text:
if result:
- if result[-1].tail:
- result[-1].tail += text
+ if result[-1][0].tail:
+ result[-1][0].tail += text
+ else:
+ result[-1][0].tail = text
+ elif not isText:
+ if parent.tail:
+ parent.tail += text
else:
- result[-1].tail = text
+ parent.tail = text
else:
if parent.text:
parent.text += text
else:
parent.text = text
-
result = []
strartIndex = 0
while data:
@@ -168,28 +206,33 @@ class InlineProcessor(Treeprocessor):
text = data[strartIndex:index]
linkText(text)
- if not isString(node): # it's Element
- for child in [node] + node.getchildren():
+ if not isString(node): # it's Element
+ for child in [node] + list(node):
if child.tail:
if child.tail.strip():
- self.__processElementText(node, child, False)
+ self.__processElementText(
+ node, child, False
+ )
if child.text:
if child.text.strip():
self.__processElementText(child, child)
- else: # it's just a string
+ else: # it's just a string
linkText(node)
strartIndex = phEndIndex
continue
strartIndex = phEndIndex
- result.append(node)
+ result.append((node, self.ancestors[:]))
- else: # wrong placeholder
- end = index + len(prefix)
+ else: # wrong placeholder
+ end = index + len(self.__placeholder_prefix)
linkText(data[strartIndex:end])
strartIndex = end
else:
text = data[strartIndex:]
+ if isinstance(data, util.AtomicString):
+ # We don't want to loose the AtomicString
+ text = util.AtomicString(text)
linkText(text)
data = ""
@@ -205,94 +248,149 @@ class InlineProcessor(Treeprocessor):
* data: the text to be processed
* pattern: the pattern to be checked
* patternIndex: index of current pattern
- * startIndex: string index, from which we starting search
+ * startIndex: string index, from which we start searching
Returns: String with placeholders instead of ElementTree elements.
"""
- match = pattern.getCompiledRegExp().match(data[startIndex:])
- leftData = data[:startIndex]
+ new_style = isinstance(pattern, inlinepatterns.InlineProcessor)
+
+ for exclude in pattern.ANCESTOR_EXCLUDES:
+ if exclude.lower() in self.ancestors:
+ return data, False, 0
+
+ if new_style:
+ match = None
+ # Since handleMatch may reject our first match,
+ # we iterate over the buffer looking for matches
+ # until we can't find any more.
+ for match in pattern.getCompiledRegExp().finditer(data, startIndex):
+ node, start, end = pattern.handleMatch(match, data)
+ if start is None or end is None:
+ startIndex += match.end(0)
+ match = None
+ continue
+ break
+ else: # pragma: no cover
+ match = pattern.getCompiledRegExp().match(data[startIndex:])
+ leftData = data[:startIndex]
if not match:
return data, False, 0
- node = pattern.handleMatch(match)
+ if not new_style: # pragma: no cover
+ node = pattern.handleMatch(match)
+ start = match.start(0)
+ end = match.end(0)
if node is None:
- return data, True, len(leftData) + match.span(len(match.groups()))[0]
+ return data, True, end
if not isString(node):
- if not isinstance(node.text, markdown.AtomicString):
+ if not isinstance(node.text, util.AtomicString):
# We need to process current node too
- for child in [node] + node.getchildren():
+ for child in [node] + list(node):
if not isString(node):
if child.text:
- child.text = self.__handleInline(child.text,
- patternIndex + 1)
+ self.ancestors.append(child.tag.lower())
+ child.text = self.__handleInline(
+ child.text, patternIndex + 1
+ )
+ self.ancestors.pop()
if child.tail:
- child.tail = self.__handleInline(child.tail,
- patternIndex)
+ child.tail = self.__handleInline(
+ child.tail, patternIndex
+ )
placeholder = self.__stashNode(node, pattern.type())
- return "%s%s%s%s" % (leftData,
- match.group(1),
- placeholder, match.groups()[-1]), True, 0
-
- def run(self, tree):
+ if new_style:
+ return "{}{}{}".format(data[:start],
+ placeholder, data[end:]), True, 0
+ else: # pragma: no cover
+ return "{}{}{}{}".format(leftData,
+ match.group(1),
+ placeholder, match.groups()[-1]), True, 0
+
+ def __build_ancestors(self, parent, parents):
+ """Build the ancestor list."""
+ ancestors = []
+ while parent is not None:
+ if parent is not None:
+ ancestors.append(parent.tag.lower())
+ parent = self.parent_map.get(parent)
+ ancestors.reverse()
+ parents.extend(ancestors)
+
+ def run(self, tree, ancestors=None):
"""Apply inline patterns to a parsed Markdown tree.
Iterate over ElementTree, find elements with inline tag, apply inline
patterns and append newly created Elements to tree. If you don't
- want process your data with inline paterns, instead of normal string,
- use subclass AtomicString:
+ want to process your data with inline patterns, instead of normal
+ string, use subclass AtomicString:
- node.text = markdown.AtomicString("data won't be processed with inline patterns")
+ node.text = markdown.AtomicString("This will not be processed.")
Arguments:
- * markdownTree: ElementTree object, representing Markdown tree.
+ * tree: ElementTree object, representing Markdown tree.
+ * ancestors: List of parent tag names that precede the tree node (if needed).
Returns: ElementTree object with applied inline patterns.
"""
self.stashed_nodes = {}
- stack = [tree]
+ # Ensure a valid parent list, but copy passed in lists
+ # to ensure we don't have the user accidentally change it on us.
+ tree_parents = [] if ancestors is None else ancestors[:]
+
+ self.parent_map = {c: p for p in tree.iter() for c in p}
+ stack = [(tree, tree_parents)]
while stack:
- currElement = stack.pop()
+ currElement, parents = stack.pop()
+
+ self.ancestors = parents
+ self.__build_ancestors(currElement, self.ancestors)
+
insertQueue = []
- for child in currElement.getchildren():
- if child.text and not isinstance(child.text, markdown.AtomicString):
+ for child in currElement:
+ if child.text and not isinstance(
+ child.text, util.AtomicString
+ ):
+ self.ancestors.append(child.tag.lower())
text = child.text
child.text = None
- lst = self.__processPlaceholders(self.__handleInline(
- text), child)
+ lst = self.__processPlaceholders(
+ self.__handleInline(text), child
+ )
+ for item in lst:
+ self.parent_map[item[0]] = child
stack += lst
insertQueue.append((child, lst))
-
- if child.getchildren():
- stack.append(child)
+ self.ancestors.pop()
+ if child.tail:
+ tail = self.__handleInline(child.tail)
+ dumby = etree.Element('d')
+ child.tail = None
+ tailResult = self.__processPlaceholders(tail, dumby, False)
+ if dumby.tail:
+ child.tail = dumby.tail
+ pos = list(currElement).index(child) + 1
+ tailResult.reverse()
+ for newChild in tailResult:
+ self.parent_map[newChild[0]] = currElement
+ currElement.insert(pos, newChild[0])
+ if len(child):
+ self.parent_map[child] = currElement
+ stack.append((child, self.ancestors[:]))
for element, lst in insertQueue:
- if element.text:
- element.text = \
- markdown.inlinepatterns.handleAttributes(element.text,
- element)
- i = 0
- for newChild in lst:
- # Processing attributes
- if newChild.tail:
- newChild.tail = \
- markdown.inlinepatterns.handleAttributes(newChild.tail,
- element)
- if newChild.text:
- newChild.text = \
- markdown.inlinepatterns.handleAttributes(newChild.text,
- newChild)
+ for i, obj in enumerate(lst):
+ newChild = obj[0]
element.insert(i, newChild)
- i += 1
return tree
@@ -303,15 +401,13 @@ class PrettifyTreeprocessor(Treeprocessor):
""" Recursively add linebreaks to ElementTree children. """
i = "\n"
- if markdown.isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']:
+ if self.md.is_block_level(elem.tag) and elem.tag not in ['code', 'pre']:
if (not elem.text or not elem.text.strip()) \
- and len(elem) and markdown.isBlockLevel(elem[0].tag):
+ and len(elem) and self.md.is_block_level(elem[0].tag):
elem.text = i
for e in elem:
- if markdown.isBlockLevel(e.tag):
+ if self.md.is_block_level(e.tag):
self._prettifyETree(e)
- if not elem.tail or not elem.tail.strip():
- elem.tail = i
if not elem.tail or not elem.tail.strip():
elem.tail = i
@@ -319,11 +415,44 @@ class PrettifyTreeprocessor(Treeprocessor):
""" Add linebreaks to ElementTree root object. """
self._prettifyETree(root)
- # Do <br />'s seperately as they are often in the middle of
+ # Do <br />'s separately as they are often in the middle of
# inline content and missed by _prettifyETree.
- brs = root.getiterator('br')
+ brs = root.iter('br')
for br in brs:
if not br.tail or not br.tail.strip():
br.tail = '\n'
else:
br.tail = '\n%s' % br.tail
+ # Clean up extra empty lines at end of code blocks.
+ pres = root.iter('pre')
+ for pre in pres:
+ if len(pre) and pre[0].tag == 'code':
+ code = pre[0]
+ # Only prettify code containing text only
+ if not len(code) and code.text is not None:
+ code.text = util.AtomicString(code.text.rstrip() + '\n')
+
+
+class UnescapeTreeprocessor(Treeprocessor):
+ """ Restore escaped chars """
+
+ RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))
+
+ def _unescape(self, m):
+ return chr(int(m.group(1)))
+
+ def unescape(self, text):
+ return self.RE.sub(self._unescape, text)
+
+ def run(self, root):
+ """ Loop over all elements and unescape all text. """
+ for elem in root.iter():
+ # Unescape text content
+ if elem.text and not elem.tag == 'code':
+ elem.text = self.unescape(elem.text)
+ # Unescape tail content
+ if elem.tail:
+ elem.tail = self.unescape(elem.tail)
+ # Unescape attribute values
+ for key, value in elem.items():
+ elem.set(key, self.unescape(value))
diff --git a/markdown/util.py b/markdown/util.py
new file mode 100644
index 0000000..e6b08e5
--- /dev/null
+++ b/markdown/util.py
@@ -0,0 +1,358 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+import re
+import sys
+import warnings
+from collections import namedtuple
+from functools import wraps, lru_cache
+from itertools import count
+
+
+"""
+Constants you might want to modify
+-----------------------------------------------------------------------------
+"""
+
+
+BLOCK_LEVEL_ELEMENTS = [
+ # Elements which are invalid to wrap in a `<p>` tag.
+ # See https://w3c.github.io/html/grouping-content.html#the-p-element
+ 'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
+ 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3',
+ 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol',
+ 'p', 'pre', 'section', 'table', 'ul',
+ # Other elements which Markdown should not be mucking up the contents of.
+ 'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend',
+ 'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script',
+ 'style', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video'
+]
+
+# Placeholders
+STX = '\u0002' # Use STX ("Start of text") for start-of-placeholder
+ETX = '\u0003' # Use ETX ("End of text") for end-of-placeholder
+INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
+INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
+INLINE_PLACEHOLDER_RE = re.compile(INLINE_PLACEHOLDER % r'([0-9]+)')
+AMP_SUBSTITUTE = STX+"amp"+ETX
+HTML_PLACEHOLDER = STX + "wzxhzdk:%s" + ETX
+HTML_PLACEHOLDER_RE = re.compile(HTML_PLACEHOLDER % r'([0-9]+)')
+TAG_PLACEHOLDER = STX + "hzzhzkh:%s" + ETX
+
+
+"""
+Constants you probably do not need to change
+-----------------------------------------------------------------------------
+"""
+
+RTL_BIDI_RANGES = (
+ ('\u0590', '\u07FF'),
+ # Hebrew (0590-05FF), Arabic (0600-06FF),
+ # Syriac (0700-074F), Arabic supplement (0750-077F),
+ # Thaana (0780-07BF), Nko (07C0-07FF).
+ ('\u2D30', '\u2D7F') # Tifinagh
+)
+
+
+"""
+AUXILIARY GLOBAL FUNCTIONS
+=============================================================================
+"""
+
+
+@lru_cache(maxsize=None)
+def get_installed_extensions():
+ if sys.version_info >= (3, 10):
+ from importlib import metadata
+ else: # <PY310 use backport
+ import importlib_metadata as metadata
+ # Only load extension entry_points once.
+ return metadata.entry_points(group='markdown.extensions')
+
+
+def deprecated(message, stacklevel=2):
+ """
+ Raise a DeprecationWarning when wrapped function/method is called.
+
+ Usage:
+ @deprecated("This method will be removed in version X; use Y instead.")
+ def some_method()"
+ pass
+ """
+ def wrapper(func):
+ @wraps(func)
+ def deprecated_func(*args, **kwargs):
+ warnings.warn(
+ f"'{func.__name__}' is deprecated. {message}",
+ category=DeprecationWarning,
+ stacklevel=stacklevel
+ )
+ return func(*args, **kwargs)
+ return deprecated_func
+ return wrapper
+
+
+def parseBoolValue(value, fail_on_errors=True, preserve_none=False):
+ """Parses a string representing bool value. If parsing was successful,
+ returns True or False. If preserve_none=True, returns True, False,
+ or None. If parsing was not successful, raises ValueError, or, if
+ fail_on_errors=False, returns None."""
+ if not isinstance(value, str):
+ if preserve_none and value is None:
+ return value
+ return bool(value)
+ elif preserve_none and value.lower() == 'none':
+ return None
+ elif value.lower() in ('true', 'yes', 'y', 'on', '1'):
+ return True
+ elif value.lower() in ('false', 'no', 'n', 'off', '0', 'none'):
+ return False
+ elif fail_on_errors:
+ raise ValueError('Cannot parse bool value: %r' % value)
+
+
+def code_escape(text):
+ """Escape code."""
+ if "&" in text:
+ text = text.replace("&", "&amp;")
+ if "<" in text:
+ text = text.replace("<", "&lt;")
+ if ">" in text:
+ text = text.replace(">", "&gt;")
+ return text
+
+
+def _get_stack_depth(size=2):
+ """Get current stack depth, performantly.
+ """
+ frame = sys._getframe(size)
+
+ for size in count(size):
+ frame = frame.f_back
+ if not frame:
+ return size
+
+
+def nearing_recursion_limit():
+ """Return true if current stack depth is within 100 of maximum limit."""
+ return sys.getrecursionlimit() - _get_stack_depth() < 100
+
+
+"""
+MISC AUXILIARY CLASSES
+=============================================================================
+"""
+
+
+class AtomicString(str):
+ """A string which should not be further processed."""
+ pass
+
+
+class Processor:
+ def __init__(self, md=None):
+ self.md = md
+
+
+class HtmlStash:
+ """
+ This class is used for stashing HTML objects that we extract
+ in the beginning and replace with place-holders.
+ """
+
+ def __init__(self):
+ """ Create a HtmlStash. """
+ self.html_counter = 0 # for counting inline html segments
+ self.rawHtmlBlocks = []
+ self.tag_counter = 0
+ self.tag_data = [] # list of dictionaries in the order tags appear
+
+ def store(self, html):
+ """
+ Saves an HTML segment for later reinsertion. Returns a
+ placeholder string that needs to be inserted into the
+ document.
+
+ Keyword arguments:
+
+ * html: an html segment
+
+ Returns : a placeholder string
+
+ """
+ self.rawHtmlBlocks.append(html)
+ placeholder = self.get_placeholder(self.html_counter)
+ self.html_counter += 1
+ return placeholder
+
+ def reset(self):
+ self.html_counter = 0
+ self.rawHtmlBlocks = []
+
+ def get_placeholder(self, key):
+ return HTML_PLACEHOLDER % key
+
+ def store_tag(self, tag, attrs, left_index, right_index):
+ """Store tag data and return a placeholder."""
+ self.tag_data.append({'tag': tag, 'attrs': attrs,
+ 'left_index': left_index,
+ 'right_index': right_index})
+ placeholder = TAG_PLACEHOLDER % str(self.tag_counter)
+ self.tag_counter += 1 # equal to the tag's index in self.tag_data
+ return placeholder
+
+
+# Used internally by `Registry` for each item in its sorted list.
+# Provides an easier to read API when editing the code later.
+# For example, `item.name` is more clear than `item[0]`.
+_PriorityItem = namedtuple('PriorityItem', ['name', 'priority'])
+
+
+class Registry:
+ """
+ A priority sorted registry.
+
+ A `Registry` instance provides two public methods to alter the data of the
+ registry: `register` and `deregister`. Use `register` to add items and
+ `deregister` to remove items. See each method for specifics.
+
+ When registering an item, a "name" and a "priority" must be provided. All
+ items are automatically sorted by "priority" from highest to lowest. The
+ "name" is used to remove ("deregister") and get items.
+
+ A `Registry` instance it like a list (which maintains order) when reading
+ data. You may iterate over the items, get an item and get a count (length)
+ of all items. You may also check that the registry contains an item.
+
+ When getting an item you may use either the index of the item or the
+ string-based "name". For example:
+
+ registry = Registry()
+ registry.register(SomeItem(), 'itemname', 20)
+ # Get the item by index
+ item = registry[0]
+ # Get the item by name
+ item = registry['itemname']
+
+ When checking that the registry contains an item, you may use either the
+ string-based "name", or a reference to the actual item. For example:
+
+ someitem = SomeItem()
+ registry.register(someitem, 'itemname', 20)
+ # Contains the name
+ assert 'itemname' in registry
+ # Contains the item instance
+ assert someitem in registry
+
+ The method `get_index_for_name` is also available to obtain the index of
+ an item using that item's assigned "name".
+ """
+
+ def __init__(self):
+ self._data = {}
+ self._priority = []
+ self._is_sorted = False
+
+ def __contains__(self, item):
+ if isinstance(item, str):
+ # Check if an item exists by this name.
+ return item in self._data.keys()
+ # Check if this instance exists.
+ return item in self._data.values()
+
+ def __iter__(self):
+ self._sort()
+ return iter([self._data[k] for k, p in self._priority])
+
+ def __getitem__(self, key):
+ self._sort()
+ if isinstance(key, slice):
+ data = Registry()
+ for k, p in self._priority[key]:
+ data.register(self._data[k], k, p)
+ return data
+ if isinstance(key, int):
+ return self._data[self._priority[key].name]
+ return self._data[key]
+
+ def __len__(self):
+ return len(self._priority)
+
+ def __repr__(self):
+ return '<{}({})>'.format(self.__class__.__name__, list(self))
+
+ def get_index_for_name(self, name):
+ """
+ Return the index of the given name.
+ """
+ if name in self:
+ self._sort()
+ return self._priority.index(
+ [x for x in self._priority if x.name == name][0]
+ )
+ raise ValueError('No item named "{}" exists.'.format(name))
+
+ def register(self, item, name, priority):
+ """
+ Add an item to the registry with the given name and priority.
+
+ Parameters:
+
+ * `item`: The item being registered.
+ * `name`: A string used to reference the item.
+ * `priority`: An integer or float used to sort against all items.
+
+ If an item is registered with a "name" which already exists, the
+ existing item is replaced with the new item. Treat carefully as the
+ old item is lost with no way to recover it. The new item will be
+ sorted according to its priority and will **not** retain the position
+ of the old item.
+ """
+ if name in self:
+ # Remove existing item of same name first
+ self.deregister(name)
+ self._is_sorted = False
+ self._data[name] = item
+ self._priority.append(_PriorityItem(name, priority))
+
+ def deregister(self, name, strict=True):
+ """
+ Remove an item from the registry.
+
+ Set `strict=False` to fail silently.
+ """
+ try:
+ index = self.get_index_for_name(name)
+ del self._priority[index]
+ del self._data[name]
+ except ValueError:
+ if strict:
+ raise
+
+ def _sort(self):
+ """
+ Sort the registry by priority from highest to lowest.
+
+ This method is called internally and should never be explicitly called.
+ """
+ if not self._is_sorted:
+ self._priority.sort(key=lambda item: item.priority, reverse=True)
+ self._is_sorted = True