Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More thorough cleanup of input whitespace #151

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 81 additions & 41 deletions markdownify/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
convert_heading_re = re.compile(r'convert_h(\d+)')
line_beginning_re = re.compile(r'^', re.MULTILINE)
whitespace_re = re.compile(r'[\t ]+')
all_whitespace_re = re.compile(r'[\s]+')
all_whitespace_re = re.compile(r'[\t \r\n]+')
newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
html_heading_re = re.compile(r'h[1-6]')


Expand Down Expand Up @@ -66,6 +67,23 @@ def _todict(obj):
return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_'))


def remove_whitespace_inside(el):
"""Return to remove whitespace immediately inside a block-level element."""
if not el or not el.name:
return False
if html_heading_re.match(el.name) is not None:
return True
return el.name in ('p', 'blockquote',
'ol', 'ul', 'li',
'table', 'thead', 'tbody', 'tfoot',
'tr', 'td', 'th')


def remove_whitespace_outside(el):
"""Return to remove whitespace immediately outside a block-level element."""
return remove_whitespace_inside(el) or (el and el.name == 'pre')


class MarkdownConverter(object):
class DefaultOptions:
autolinks = True
Expand Down Expand Up @@ -119,27 +137,23 @@ def process_tag(self, node, convert_as_inline, children_only=False):
if not children_only and (isHeading or isCell):
convert_children_as_inline = True

# Remove whitespace-only textnodes in purely nested nodes
def is_nested_node(el):
return el and el.name in ['ol', 'ul', 'li',
'table', 'thead', 'tbody', 'tfoot',
'tr', 'td', 'th']

if is_nested_node(node):
for el in node.children:
# Only extract (remove) whitespace-only text node if any of the
# conditions is true:
# - el is the first element in its parent
# - el is the last element in its parent
# - el is adjacent to an nested node
can_extract = (not el.previous_sibling
or not el.next_sibling
or is_nested_node(el.previous_sibling)
or is_nested_node(el.next_sibling))
if (isinstance(el, NavigableString)
and six.text_type(el).strip() == ''
and can_extract):
el.extract()
# Remove whitespace-only textnodes just before, after or
# inside block-level elements.
remove_inside = remove_whitespace_inside(node)
for el in node.children:
# Only extract (remove) whitespace-only text node if any of the
# conditions is true:
# - el is the first element in its parent (block-level)
# - el is the last element in its parent (block-level)
# - el is adjacent to a block-level node
can_extract = (remove_inside and (not el.previous_sibling
or not el.next_sibling)
or remove_whitespace_outside(el.previous_sibling)
or remove_whitespace_outside(el.next_sibling))
if (isinstance(el, NavigableString)
and six.text_type(el).strip() == ''
and can_extract):
el.extract()

# Convert the children first
for el in node.children:
Expand All @@ -148,7 +162,13 @@ def is_nested_node(el):
elif isinstance(el, NavigableString):
text += self.process_text(el)
else:
text += self.process_tag(el, convert_children_as_inline)
text_strip = text.rstrip('\n')
newlines_left = len(text) - len(text_strip)
next_text = self.process_tag(el, convert_children_as_inline)
next_text_strip = next_text.lstrip('\n')
newlines_right = len(next_text) - len(next_text_strip)
newlines = '\n' * max(newlines_left, newlines_right)
text = text_strip + newlines + next_text_strip

if not children_only:
convert_fn = getattr(self, 'convert_%s' % node.name, None)
Expand All @@ -162,18 +182,26 @@ def process_text(self, el):

# normalize whitespace if we're not inside a preformatted element
if not el.find_parent('pre'):
text = whitespace_re.sub(' ', text)
if self.options['wrap']:
text = all_whitespace_re.sub(' ', text)
else:
text = newline_whitespace_re.sub('\n', text)
text = whitespace_re.sub(' ', text)

# escape special characters if we're not inside a preformatted or code element
if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
text = self.escape(text)

# remove trailing whitespaces if any of the following condition is true:
# - current text node is the last node in li
# - current text node is followed by an embedded list
if (el.parent.name == 'li'
and (not el.next_sibling
or el.next_sibling.name in ['ul', 'ol'])):
# remove leading whitespace at the start or just after a
# block-level element; remove traliing whitespace at the end
# or just before a block-level element.
if (remove_whitespace_outside(el.previous_sibling)
or (remove_whitespace_inside(el.parent)
and not el.previous_sibling)):
text = text.lstrip()
if (remove_whitespace_outside(el.next_sibling)
or (remove_whitespace_inside(el.parent)
and not el.next_sibling)):
text = text.rstrip()

return text
Expand Down Expand Up @@ -221,7 +249,7 @@ def indent(self, text, level):

def underline(self, text, pad_char):
text = (text or '').rstrip()
return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''

def convert_a(self, el, text, convert_as_inline):
prefix, suffix, text = chomp(text)
Expand All @@ -246,7 +274,7 @@ def convert_a(self, el, text, convert_as_inline):
def convert_blockquote(self, el, text, convert_as_inline):

if convert_as_inline:
return text
return ' ' + text.strip() + ' '

return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else ''

Expand Down Expand Up @@ -280,10 +308,11 @@ def convert_hn(self, n, el, text, convert_as_inline):
if style == UNDERLINED and n <= 2:
line = '=' if n == 1 else '-'
return self.underline(text, line)
text = all_whitespace_re.sub(' ', text)
hashes = '#' * n
if style == ATX_CLOSED:
return '%s %s %s\n\n' % (hashes, text, hashes)
return '%s %s\n\n' % (hashes, text)
return '\n%s %s %s\n\n' % (hashes, text, hashes)
return '\n%s %s\n\n' % (hashes, text)

def convert_hr(self, el, text, convert_as_inline):
return '\n\n---\n\n'
Expand Down Expand Up @@ -318,7 +347,7 @@ def convert_list(self, el, text, convert_as_inline):
if nested:
# remove trailing newline if nested
return '\n' + self.indent(text, 1).rstrip()
return text + ('\n' if before_paragraph else '')
return '\n\n' + text + ('\n' if before_paragraph else '')

convert_ul = convert_list
convert_ol = convert_list
Expand All @@ -343,13 +372,24 @@ def convert_li(self, el, text, convert_as_inline):

def convert_p(self, el, text, convert_as_inline):
if convert_as_inline:
return text
return ' ' + text.strip() + ' '
if self.options['wrap']:
text = fill(text,
width=self.options['wrap_width'],
break_long_words=False,
break_on_hyphens=False)
return '%s\n\n' % text if text else ''
# Preserve newlines (and preceding whitespace) resulting
# from <br> tags. Newlines in the input have already been
# replaced by spaces.
lines = text.split('\n')
new_lines = []
for line in lines:
line = line.lstrip()
line_no_trailing = line.rstrip()
trailing = line[len(line_no_trailing):]
line = fill(line,
width=self.options['wrap_width'],
break_long_words=False,
break_on_hyphens=False)
new_lines.append(line + trailing)
text = '\n'.join(new_lines)
return '\n\n%s\n\n' % text if text else ''

def convert_pre(self, el, text, convert_as_inline):
if not text:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_advanced.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def test_chomp():

def test_nested():
text = md('<p>This is an <a href="http://example.com/">example link</a>.</p>')
assert text == 'This is an [example link](http://example.com/).\n\n'
assert text == '\n\nThis is an [example link](http://example.com/).\n\n'


def test_ignore_comments():
Expand Down
1 change: 1 addition & 0 deletions tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ def test_soup():

def test_whitespace():
assert md(' a b \t\t c ') == ' a b c '
assert md(' a b \n\n c ') == ' a b\nc '
90 changes: 57 additions & 33 deletions tests/test_conversions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE
from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE


def inline_tests(tag, markup):
Expand Down Expand Up @@ -66,7 +66,7 @@ def test_blockquote_with_paragraph():

def test_blockquote_nested():
text = md('<blockquote>And she was like <blockquote>Hello</blockquote></blockquote>')
assert text == '\n> And she was like \n> > Hello\n\n'
assert text == '\n> And she was like\n> > Hello\n\n'


def test_br():
Expand Down Expand Up @@ -112,36 +112,39 @@ def test_em():


def test_header_with_space():
assert md('<h3>\n\nHello</h3>') == '### Hello\n\n'
assert md('<h4>\n\nHello</h4>') == '#### Hello\n\n'
assert md('<h5>\n\nHello</h5>') == '##### Hello\n\n'
assert md('<h5>\n\nHello\n\n</h5>') == '##### Hello\n\n'
assert md('<h5>\n\nHello \n\n</h5>') == '##### Hello\n\n'
assert md('<h3>\n\nHello</h3>') == '\n### Hello\n\n'
assert md('<h3>Hello\n\n\nWorld</h3>') == '\n### Hello World\n\n'
assert md('<h4>\n\nHello</h4>') == '\n#### Hello\n\n'
assert md('<h5>\n\nHello</h5>') == '\n##### Hello\n\n'
assert md('<h5>\n\nHello\n\n</h5>') == '\n##### Hello\n\n'
assert md('<h5>\n\nHello \n\n</h5>') == '\n##### Hello\n\n'


def test_h1():
assert md('<h1>Hello</h1>') == 'Hello\n=====\n\n'
assert md('<h1>Hello</h1>') == '\n\nHello\n=====\n\n'


def test_h2():
assert md('<h2>Hello</h2>') == 'Hello\n-----\n\n'
assert md('<h2>Hello</h2>') == '\n\nHello\n-----\n\n'


def test_hn():
assert md('<h3>Hello</h3>') == '### Hello\n\n'
assert md('<h4>Hello</h4>') == '#### Hello\n\n'
assert md('<h5>Hello</h5>') == '##### Hello\n\n'
assert md('<h6>Hello</h6>') == '###### Hello\n\n'
assert md('<h3>Hello</h3>') == '\n### Hello\n\n'
assert md('<h4>Hello</h4>') == '\n#### Hello\n\n'
assert md('<h5>Hello</h5>') == '\n##### Hello\n\n'
assert md('<h6>Hello</h6>') == '\n###### Hello\n\n'


def test_hn_chained():
assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '# First\n\n\n## Second\n\n\n### Third\n\n'
assert md('X<h1>First</h1>', heading_style=ATX) == 'X# First\n\n'
assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '\n# First\n\n## Second\n\n### Third\n\n'
assert md('X<h1>First</h1>', heading_style=ATX) == 'X\n# First\n\n'
assert md('X<h1>First</h1>', heading_style=ATX_CLOSED) == 'X\n# First #\n\n'
assert md('X<h1>First</h1>') == 'X\n\nFirst\n=====\n\n'


def test_hn_nested_tag_heading_style():
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX_CLOSED) == '# A P C #\n\n'
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX) == '# A P C\n\n'
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX_CLOSED) == '\n# A P C #\n\n'
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX) == '\n# A P C\n\n'


def test_hn_nested_simple_tag():
Expand All @@ -157,12 +160,12 @@ def test_hn_nested_simple_tag():
]

for tag, markdown in tag_to_markdown:
assert md('<h3>A <' + tag + '>' + tag + '</' + tag + '> B</h3>') == '### A ' + markdown + ' B\n\n'
assert md('<h3>A <' + tag + '>' + tag + '</' + tag + '> B</h3>') == '\n### A ' + markdown + ' B\n\n'

assert md('<h3>A <br>B</h3>', heading_style=ATX) == '### A B\n\n'
assert md('<h3>A <br>B</h3>', heading_style=ATX) == '\n### A B\n\n'

# Nested lists not supported
# assert md('<h3>A <ul><li>li1</i><li>l2</li></ul></h3>', heading_style=ATX) == '### A li1 li2 B\n\n'
# assert md('<h3>A <ul><li>li1</i><li>l2</li></ul></h3>', heading_style=ATX) == '\n### A li1 li2 B\n\n'


def test_hn_nested_img():
Expand All @@ -172,18 +175,18 @@ def test_hn_nested_img():
("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""),
]
for image_attributes, markdown, title in image_attributes_to_markdown:
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '### A ' + markdown + ' B\n\n'
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n'
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '\n### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n'
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n'


def test_hn_atx_headings():
assert md('<h1>Hello</h1>', heading_style=ATX) == '# Hello\n\n'
assert md('<h2>Hello</h2>', heading_style=ATX) == '## Hello\n\n'
assert md('<h1>Hello</h1>', heading_style=ATX) == '\n# Hello\n\n'
assert md('<h2>Hello</h2>', heading_style=ATX) == '\n## Hello\n\n'


def test_hn_atx_closed_headings():
assert md('<h1>Hello</h1>', heading_style=ATX_CLOSED) == '# Hello #\n\n'
assert md('<h2>Hello</h2>', heading_style=ATX_CLOSED) == '## Hello ##\n\n'
assert md('<h1>Hello</h1>', heading_style=ATX_CLOSED) == '\n# Hello #\n\n'
assert md('<h2>Hello</h2>', heading_style=ATX_CLOSED) == '\n## Hello ##\n\n'


def test_head():
Expand All @@ -193,7 +196,7 @@ def test_head():
def test_hr():
assert md('Hello<hr>World') == 'Hello\n\n---\n\nWorld'
assert md('Hello<hr />World') == 'Hello\n\n---\n\nWorld'
assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == 'Hello\n\n\n\n\n---\n\n\nWorld\n\n'
assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == '\n\nHello\n\n---\n\nWorld\n\n'


def test_i():
Expand All @@ -210,12 +213,23 @@ def test_kbd():


def test_p():
assert md('<p>hello</p>') == 'hello\n\n'
assert md('<p>123456789 123456789</p>') == '123456789 123456789\n\n'
assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '123456789\n123456789\n\n'
assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '[Some long\nlink](https://example.com)\n\n'
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345\\\n67890\n\n'
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345678901\\\n12345\n\n'
assert md('<p>hello</p>') == '\n\nhello\n\n'
assert md('<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n'
assert md('<p>123456789\n\n\n123456789</p>') == '\n\n123456789\n123456789\n\n'
assert md('<p>123456789\n\n\n123456789</p>', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n'
assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n'
assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n'
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345 \n67890\n\n'
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345 \n67890\n\n'
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n'
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n'
assert md('<p>1234 5678 9012<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n1234 5678\n9012\\\n67890\n\n'
assert md('<p>1234 5678 9012<br />67890</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n1234 5678\n9012 \n67890\n\n'
assert md('First<p>Second</p><p>Third</p>Fourth') == 'First\n\nSecond\n\nThird\n\nFourth'


def test_pre():
Expand Down Expand Up @@ -289,3 +303,13 @@ def callback(el):
assert md('<pre class="python">test\n foo\nbar</pre>', code_language_callback=callback) == '\n```python\ntest\n foo\nbar\n```\n'
assert md('<pre class="javascript"><code>test\n foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n foo\nbar\n```\n'
assert md('<pre class="javascript"><code class="javascript">test\n foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n foo\nbar\n```\n'


def test_spaces():
assert md('<p> a b </p> <p> c d </p>') == '\n\na b\n\nc d\n\n'
assert md('<p> <i>a</i> </p>') == '\n\n*a*\n\n'
assert md('test <p> again </p>') == 'test\n\nagain\n\n'
assert md('test <blockquote> text </blockquote> after') == 'test\n> text\n\nafter'
assert md(' <ol> <li> x </li> <li> y </li> </ol> ') == '\n\n1. x\n2. y\n'
assert md(' <ul> <li> x </li> <li> y </li> </ol> ') == '\n\n* x\n* y\n'
assert md('test <pre> foo </pre> bar') == 'test\n```\n foo \n```\nbar'
Loading
Loading