Skip to content

Commit

Permalink
fix: Add back patch for converting wiki content from html to markdown
Browse files Browse the repository at this point in the history
  • Loading branch information
AyshaHakeem committed Sep 18, 2024
1 parent 18ef051 commit ef14e24
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 0 deletions.
1 change: 1 addition & 0 deletions wiki/patches.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ wiki.wiki.doctype.wiki_feedback.patches.delete_wiki_feedback_item
[post_model_sync]
wiki.wiki.doctype.wiki_space.patches.wiki_sidebar_migration
wiki.wiki.doctype.wiki_settings.patches.wiki_navbar_item_migration
wiki.wiki.doctype.wiki_page.patches.convert_wiki_content_to_markdown
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import re

import frappe
import six
from bs4 import Comment, Doctype, NavigableString
from markdownify import MarkdownConverter

# Pattern for HTML heading tag names ("h1" .. "h6"). It is applied with
# .match(), so it is anchored at the start of the tag name only.
html_heading_re = re.compile(r"h[1-6]")


class CustomMarkdownConverter(MarkdownConverter):
    """Markdown converter that keeps embedded-media tags as raw HTML.

    Overrides markdownify's ``process_tag`` so that tags listed in
    ``_RAW_HTML_TAGS`` are copied into the output as HTML instead of being
    reduced to the Markdown of their children.
    """

    # Structural list/table tags: whitespace-only text nodes directly inside
    # them are pruned before conversion.
    _NESTED_TAGS = frozenset(
        ("ol", "ul", "li", "table", "thead", "tbody", "tfoot", "tr", "td", "th")
    )

    # Tags emitted as serialised HTML rather than converted to Markdown.
    _RAW_HTML_TAGS = frozenset(
        ("video", "iframe", "audio", "embed", "object", "source", "picture", "math")
    )

    @classmethod
    def _is_nested_node(cls, el):
        """Return True when ``el`` is a non-empty node named in ``_NESTED_TAGS``."""
        return bool(el) and el.name in cls._NESTED_TAGS

    def process_tag(self, node, convert_as_inline, children_only=False):
        """Convert ``node`` and its descendants to Markdown text.

        Args:
            node: The parsed element to convert.
            convert_as_inline: Passed through to the ``convert_<tag>`` handler
                and, unless overridden below, to the children.
            children_only: When true, only the children are converted; the
                ``convert_<tag>`` handler for ``node`` itself is not applied.

        Returns:
            The Markdown text produced for ``node``.
        """
        # Markdown headings and table cells can't include block elements
        # (elements with newlines), so their children are rendered inline.
        is_heading = html_heading_re.match(node.name) is not None
        is_cell = node.name in ("td", "th")
        convert_children_as_inline = convert_as_inline
        if not children_only and (is_heading or is_cell):
            convert_children_as_inline = True

        # Remove whitespace-only text nodes in purely nested nodes.
        if self._is_nested_node(node):
            # Iterate over a snapshot: extract() removes the element from the
            # parent's child list, and mutating that list while iterating it
            # skips the element that follows every removed node.
            for el in list(node.children):
                if not isinstance(el, NavigableString):
                    continue
                if six.text_type(el).strip() != "":
                    continue
                # Only extract (remove) a whitespace-only text node if any of
                # these conditions is true:
                # - el is the first element in its parent
                # - el is the last element in its parent
                # - el is adjacent to a nested node
                can_extract = (
                    not el.previous_sibling
                    or not el.next_sibling
                    or self._is_nested_node(el.previous_sibling)
                    or self._is_nested_node(el.next_sibling)
                )
                if can_extract:
                    el.extract()

        # Convert the children first.
        parts = []
        for el in node.children:
            if isinstance(el, (Comment, Doctype)):
                continue
            if isinstance(el, NavigableString):
                parts.append(self.process_text(el))
            elif el.name in self._RAW_HTML_TAGS:
                # The element is emitted whole via process_text, so it must
                # not be recursed into as well: doing both would output its
                # children a second time (e.g. the <img> inside a <picture>).
                # NOTE(review): process_text presumably also applies
                # markdownify's text escaping to the serialised HTML; confirm
                # attribute values containing "_" or "*" survive intact.
                parts.append(self.process_text(el))
            else:
                parts.append(self.process_tag(el, convert_children_as_inline))
        text = "".join(parts)

        if not children_only:
            convert_fn = getattr(self, f"convert_{node.name}", None)
            if convert_fn and self.should_convert_tag(node.name):
                text = convert_fn(node, text, convert_as_inline)

        return text


def custom_markdownify(html, **options):
    """Return ``html`` rendered as Markdown by ``CustomMarkdownConverter``.

    Any keyword arguments are forwarded unchanged to the converter's
    constructor.
    """
    converter = CustomMarkdownConverter(**options)
    return converter.convert(html)


def execute():
    """Convert the stored content of every Wiki Page from HTML to Markdown.

    Reads ``name`` and ``content`` for all Wiki Page records, converts each
    page's content with ``custom_markdownify`` and writes the result back to
    the ``content`` field.

    Pages whose content is ``None`` or empty are skipped: there is nothing to
    convert, and a ``None`` value is expected to make the HTML parser raise,
    which would abort the migration for every remaining page.
    """
    wiki_pages = frappe.db.get_all("Wiki Page", fields=["name", "content"])
    for page in wiki_pages:
        html = page["content"]
        if not html:
            continue
        markdown_content = custom_markdownify(html)
        frappe.db.set_value("Wiki Page", page["name"], "content", markdown_content)

0 comments on commit ef14e24

Please sign in to comment.