fix: Add back patch for converting wiki content from html to markdown

frappe · Sep 18, 2024 · ef14e24 · ef14e24
1 parent 18ef051
commit ef14e24
Show file tree

Hide file tree

Showing 2 changed files with 73 additions and 0 deletions.
diff --git a/wiki/patches.txt b/wiki/patches.txt
@@ -7,3 +7,4 @@ wiki.wiki.doctype.wiki_feedback.patches.delete_wiki_feedback_item
 [post_model_sync]
 wiki.wiki.doctype.wiki_space.patches.wiki_sidebar_migration
 wiki.wiki.doctype.wiki_settings.patches.wiki_navbar_item_migration
+wiki.wiki.doctype.wiki_page.patches.convert_wiki_content_to_markdown
diff --git a/wiki/wiki/doctype/wiki_page/patches/convert_wiki_content_to_markdown.py b/wiki/wiki/doctype/wiki_page/patches/convert_wiki_content_to_markdown.py
@@ -0,0 +1,72 @@
+import re
+
+import frappe
+import six
+from bs4 import Comment, Doctype, NavigableString
+from markdownify import MarkdownConverter
+
+html_heading_re = re.compile(r"h[1-6]")
+
+
+class CustomMarkdownConverter(MarkdownConverter):
+	# override markdownify's process_tag function to escape certain html tags
+	def process_tag(self, node, convert_as_inline, children_only=False):
+		text = ""
+
+		# markdown headings or cells can't include
+		# block elements (elements w/newlines)
+		isHeading = html_heading_re.match(node.name) is not None
+		isCell = node.name in ["td", "th"]
+		convert_children_as_inline = convert_as_inline
+
+		if not children_only and (isHeading or isCell):
+			convert_children_as_inline = True
+
+		# Remove whitespace-only textnodes in purely nested nodes
+		def is_nested_node(el):
+			return el and el.name in ["ol", "ul", "li", "table", "thead", "tbody", "tfoot", "tr", "td", "th"]
+
+		if is_nested_node(node):
+			for el in node.children:
+				# Only extract (remove) whitespace-only text node if any of the
+				# conditions is true:
+				# - el is the first element in its parent
+				# - el is the last element in its parent
+				# - el is adjacent to a nested node
+				can_extract = (
+					not el.previous_sibling
+					or not el.next_sibling
+					or is_nested_node(el.previous_sibling)
+					or is_nested_node(el.next_sibling)
+				)
+				if isinstance(el, NavigableString) and six.text_type(el).strip() == "" and can_extract:
+					el.extract()
+
+		# Convert the children first
+		for el in node.children:
+			if isinstance(el, Comment) or isinstance(el, Doctype):
+				continue
+			elif isinstance(el, NavigableString):
+				text += self.process_text(el)
+			else:
+				if el.name in ["video", "iframe", "audio", "embed", "object", "source", "picture", "math"]:
+					text += self.process_text(el)
+				text += self.process_tag(el, convert_children_as_inline)
+
+		if not children_only:
+			convert_fn = getattr(self, f"convert_{node.name}", None)
+			if convert_fn and self.should_convert_tag(node.name):
+				text = convert_fn(node, text, convert_as_inline)
+
+		return text
+
+
+def custom_markdownify(html, **options):
+	return CustomMarkdownConverter(**options).convert(html)
+
+
+def execute():
+	wiki_pages = frappe.db.get_all("Wiki Page", fields=["name", "content"])
+	for page in wiki_pages:
+		markdown_content = custom_markdownify(page["content"])
+		frappe.db.set_value("Wiki Page", page["name"], "content", markdown_content)