#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Convert knowledge_report.html to Word document using python-docx"""

import re
from html.parser import HTMLParser

from docx import Document
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.shared import Pt, Inches, Cm, RGBColor


class HTMLToDocx:
    """Build a python-docx ``Document`` from HTML text.

    The instance owns the document (``self.doc``) and offers small helpers
    for adding paragraphs, headings and tables that all use the same font.
    """

    # Font applied to every run and to the base styles.
    FONT_NAME = 'Microsoft YaHei'

    def __init__(self):
        """Create an empty document, configure its styles and page margins."""
        self.doc = Document()
        self.setup_styles()

        # Parser-state flags and buffers. They are only initialised here;
        # no method in the visible part of the file reads them.
        # NOTE(review): presumably consumed by the HTML parsing code -
        # confirm against the rest of the file.
        self.in_cover = False
        self.in_exam = False
        self.in_table = False
        self.current_tr = []
        self.current_cells = []
        self.in_th = False
        self.in_td = False
        self.in_ul = False
        self.in_ol = False
        self.list_counter = 0
        self.current_text = ''

        # Page margins
        for section in self.doc.sections:
            section.top_margin = Cm(2.5)
            section.bottom_margin = Cm(2.0)
            section.left_margin = Cm(2.5)
            section.right_margin = Cm(2.5)

    def setup_styles(self):
        """Configure the 'Normal', 'Title' and 'Heading 1'-'Heading 3' styles."""
        style = self.doc.styles['Normal']
        font = style.font
        font.name = self.FONT_NAME
        font.size = Pt(11)
        # The East Asian font slot is a separate attribute on w:rFonts and
        # is written directly; this must come after ``font.name`` is set so
        # that the rPr/rFonts elements already exist.
        style.element.rPr.rFonts.set(qn('w:eastAsia'), self.FONT_NAME)

        # Title style
        title_style = self.doc.styles['Title']
        title_style.font.size = Pt(22)
        title_style.font.color.rgb = RGBColor(0x1a, 0x52, 0x76)

        # Heading styles: (font size, colour) for levels 1..3
        heading_formats = [
            (Pt(16), RGBColor(0x2c, 0x3e, 0x50)),  # Heading 1
            (Pt(14), RGBColor(0x2c, 0x3e, 0x50)),  # Heading 2
            (Pt(12), RGBColor(0x34, 0x49, 0x5e)),  # Heading 3
        ]
        for level, (size, color) in enumerate(heading_formats, 1):
            h_style = self.doc.styles[f'Heading {level}']
            h_style.font.size = size
            h_style.font.color.rgb = color
            h_style.font.bold = True
            h_style.paragraph_format.space_before = Pt(12)
            h_style.paragraph_format.space_after = Pt(6)

    def _apply_font(self, run):
        """Set ``FONT_NAME`` on *run*, including its East Asian font slot."""
        # Order matters: assigning ``font.name`` first ensures the run's
        # rPr/rFonts elements exist before the attribute is set on them.
        run.font.name = self.FONT_NAME
        run._element.rPr.rFonts.set(qn('w:eastAsia'), self.FONT_NAME)

    def add_paragraph(self, text, bold=False, indent=True, align=None,
                      size=None, color=None, spacing_after=Pt(6)):
        """Append a single-run paragraph to the document and return it.

        Args:
            text: Paragraph text.
            bold: Make the run bold when true.
            indent: Apply a 0.75 cm first-line indent when true.
            align: Paragraph alignment to apply, or ``None`` to leave the
                alignment untouched.
            size: Font size for the run; skipped when falsy.
            color: Font colour for the run; skipped when falsy.
            spacing_after: Space after the paragraph.

        Returns:
            The newly created paragraph.
        """
        p = self.doc.add_paragraph()
        run = p.add_run(text)
        self._apply_font(run)
        if bold:
            run.bold = True
        if size:
            run.font.size = size
        if color:
            run.font.color.rgb = color
        if indent:
            p.paragraph_format.first_line_indent = Cm(0.75)
        # Compare against None rather than truthiness: the LEFT alignment
        # member has integer value 0 and would otherwise be ignored.
        if align is not None:
            p.alignment = align
        p.paragraph_format.space_after = spacing_after
        return p

    def add_heading_text(self, text, level=1):
        """Append a heading of the given level and return its paragraph.

        Every run of the heading gets the shared font.
        """
        p = self.doc.add_heading(text, level=level)
        for run in p.runs:
            self._apply_font(run)
        return p

    def add_table_from_data(self, rows):
        """Add a table from parsed rows.

        Args:
            rows: Sequence of rows, each a sequence of cell values. Values
                are converted with ``str()``.

        Nothing is added when *rows* is empty or when the widest row has
        fewer than two cells. Rows shorter than the widest row leave their
        trailing cells empty. The first row is rendered bold, and an empty
        paragraph is appended after the table as spacing.
        """
        if not rows:
            return

        max_cols = max(len(r) for r in rows)
        if max_cols < 2:
            return

        table = self.doc.add_table(rows=len(rows), cols=max_cols)
        table.alignment = WD_TABLE_ALIGNMENT.CENTER
        table.style = 'Table Grid'

        for i, row_data in enumerate(rows):
            is_header = i == 0
            for j, cell_text in enumerate(row_data):
                cell = table.cell(i, j)
                cell.text = str(cell_text)
                for paragraph in cell.paragraphs:
                    paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
                    for run in paragraph.runs:
                        run.font.size = Pt(9)
                        self._apply_font(run)
                        if is_header:
                            run.bold = True

        self.doc.add_paragraph()  # spacing

    def process_html(self, html):
        """Parse HTML and build docx"""
        lines = html.split('\n')
        in_code = False
        in_table_data = False
        table_rows = []
        in_div_exam = False
        exam_content = []
        i = 0
        while i < len(lines):
            line = lines[i].strip()
            # Skip style/head/meta/body tags
            # FIXME: the source text of this method breaks off at this
            # point, inside the condition of the next `if (` statement.
            # The remainder of the loop body (including whatever advances
            # `i`) is missing and has not been reconstructed; it must be
            # restored before this method can be used.