hwpforge_smithy_md/
decoder.rs

1//! Markdown -> Core decoder.
2
3use std::path::Path;
4
5use hwpforge_blueprint::builtins::builtin_default;
6
/// Maximum markdown file size accepted when decoding from a file: 50 MB
/// (`50 * 1024 * 1024` bytes).
///
/// `read_checked` compares the on-disk size against this limit and returns
/// `MdError::FileTooLarge` when it is exceeded.
const MAX_MD_FILE_SIZE: u64 = 50 * 1024 * 1024;
9use hwpforge_blueprint::registry::StyleRegistry;
10use hwpforge_blueprint::template::Template;
11use hwpforge_core::{
12    Control, Document, Image, Paragraph, Run, RunContent, Section, Table, TableCell, TableRow,
13};
14use hwpforge_foundation::{CharShapeIndex, HwpUnit, ParaShapeIndex, StyleIndex};
15use pulldown_cmark::{Event, HeadingLevel, Options, Parser, Tag, TagEnd};
16
17use crate::error::{MdError, MdResult};
18use crate::frontmatter::{apply_to_metadata, extract_frontmatter};
19use crate::mapper::{image_format_from_path, resolve_mapping, MdMapping, MdStyleRef};
20
21mod lossless;
22
/// HTML comment that marks a section boundary in the markdown body.
///
/// It is the only raw HTML the decoder accepts: a raw-HTML event whose trimmed
/// text equals this marker (outside a table cell) records a section break;
/// any other raw HTML is rejected as an unsupported feature.
const SECTION_MARKER_COMMENT: &str = "<!-- hwpforge:section -->";
24
/// Decides whether `url` may be turned into a hyperlink.
///
/// Only `http://`, `https://` and `mailto:` destinations are accepted, matched
/// ASCII case-insensitively. An empty destination is accepted as well.
/// Everything else (`javascript:`, `data:`, `file:`, relative paths, ...) is
/// rejected so a rendered document cannot execute code or reach local
/// resources through a link.
fn is_safe_url(url: &str) -> bool {
    const ALLOWED_PREFIXES: [&str; 3] = ["http://", "https://", "mailto:"];

    if url.is_empty() {
        return true;
    }

    // `get(..len)` yields `None` when the URL is shorter than the prefix or
    // when the cut would split a multi-byte character; neither case can match
    // an ASCII prefix.
    ALLOWED_PREFIXES.iter().any(|prefix| {
        url.get(..prefix.len()).map_or(false, |head| head.eq_ignore_ascii_case(prefix))
    })
}
36
/// Result of decoding markdown: the decoded document together with the
/// [`StyleRegistry`] resolved from the template.
///
/// Keeping the two together lets callers hand the registry to later stages
/// (e.g. the HWPX encoder) without resolving the template a second time.
///
/// # Examples
///
/// ```rust,ignore
/// use hwpforge_blueprint::builtins::builtin_default;
/// use hwpforge_smithy_hwpx::HwpxStyleStore;
/// use hwpforge_smithy_md::MdDecoder;
///
/// let template = builtin_default().unwrap();
/// let result = MdDecoder::decode("# Title\n\nBody text", &template).unwrap();
///
/// // Access the document
/// let doc = result.document.validate().unwrap();
///
/// // Bridge styles to HWPX encoder
/// let store = HwpxStyleStore::from_registry(&result.style_registry);
/// ```
#[derive(Debug)]
pub struct MdDocument {
    /// The decoded Core document (one section per markdown section).
    pub document: Document,
    /// The style registry produced while resolving the template's mapping.
    pub style_registry: StyleRegistry,
}
66
67/// Markdown decoder.
68pub struct MdDecoder;
69
70impl MdDecoder {
71    /// Decodes markdown into a Core draft document **and** its style registry.
72    ///
73    /// The template is used for paragraph/character style index mapping.
74    /// Built-in template inheritance (`default`/`gov_proposal`) is resolved
75    /// automatically.
76    pub fn decode(markdown: &str, template: &Template) -> MdResult<MdDocument> {
77        let extracted = extract_frontmatter(markdown)?;
78        let (mapping, style_registry) = resolve_mapping(template)?;
79
80        let mut document = Document::new();
81        if let Some(frontmatter) = extracted.frontmatter.as_ref() {
82            apply_to_metadata(frontmatter, document.metadata_mut());
83        }
84
85        let mut state = DecoderState::new(&mapping);
86        state.decode_markdown(extracted.content)?;
87        let decoded = state.finish()?;
88
89        let mut sections = split_sections(decoded.paragraphs, &decoded.section_breaks);
90        if sections.is_empty() {
91            sections.push(vec![empty_paragraph(mapping.body)]);
92        }
93
94        for mut section_paragraphs in sections {
95            if section_paragraphs.is_empty() {
96                section_paragraphs.push(empty_paragraph(mapping.body));
97            }
98            document
99                .add_section(Section::with_paragraphs(section_paragraphs, mapping.page_settings));
100        }
101
102        Ok(MdDocument { document, style_registry })
103    }
104
105    /// Decodes lossless markdown output back into a Core draft document.
106    ///
107    /// This parses the lossless HTML-like body produced by
108    /// [`crate::MdEncoder::encode_lossless`], preserving paragraph/run shape IDs
109    /// and control/table structures.
110    pub fn decode_lossless(markdown: &str) -> MdResult<Document> {
111        let extracted = extract_frontmatter(markdown)?;
112        let sections = lossless::decode_lossless_sections(extracted.content)?;
113
114        let mut document = Document::new();
115        if let Some(frontmatter) = extracted.frontmatter.as_ref() {
116            apply_to_metadata(frontmatter, document.metadata_mut());
117        }
118
119        if sections.is_empty() {
120            document.add_section(default_empty_section());
121        } else {
122            for section in sections {
123                document.add_section(section);
124            }
125        }
126
127        Ok(document)
128    }
129
130    /// Decodes markdown using the built-in default template.
131    ///
132    /// This is a convenience wrapper around [`Self::decode`] that uses
133    /// [`builtin_default()`](hwpforge_blueprint::builtins::builtin_default)
134    /// so callers don't need to construct a template manually.
135    pub fn decode_with_default(markdown: &str) -> MdResult<MdDocument> {
136        let template = builtin_default()?;
137        Self::decode(markdown, &template)
138    }
139
140    /// Reads a markdown file and decodes it into a Core draft document with styles.
141    ///
142    /// Files larger than 50 MB are rejected with [`MdError::FileTooLarge`].
143    pub fn decode_file(path: impl AsRef<Path>, template: &Template) -> MdResult<MdDocument> {
144        let markdown = read_checked(path.as_ref())?;
145        Self::decode(&markdown, template)
146    }
147
148    /// Reads a markdown file and decodes it using the built-in default template.
149    ///
150    /// Files larger than 50 MB are rejected with [`MdError::FileTooLarge`].
151    pub fn decode_file_with_default(path: impl AsRef<Path>) -> MdResult<MdDocument> {
152        let template = builtin_default()?;
153        Self::decode_file(path, &template)
154    }
155
156    /// Reads a lossless markdown file and decodes it into a Core draft document.
157    ///
158    /// Files larger than 50 MB are rejected with [`MdError::FileTooLarge`].
159    pub fn decode_lossless_file(path: impl AsRef<Path>) -> MdResult<Document> {
160        let markdown = read_checked(path.as_ref())?;
161        Self::decode_lossless(&markdown)
162    }
163}
164
/// Numbering state of one open markdown list.
#[derive(Debug, Clone)]
struct ListState {
    /// `true` for an ordered list (the list tag carried a start number).
    ordered: bool,
    /// Number the next item of an ordered list will receive.
    next_index: u64,
}

impl ListState {
    /// Creates the state for a list whose tag carried `start`.
    ///
    /// `Some(n)` means an ordered list beginning at `n`; `None` means a bullet
    /// list, for which the counter is initialised to 1.
    fn new(start: Option<u64>) -> Self {
        match start {
            Some(first) => Self { ordered: true, next_index: first },
            None => Self { ordered: false, next_index: 1 },
        }
    }
}
176
/// A link whose start tag has been seen but whose end tag has not.
#[derive(Debug, Clone)]
struct PendingLink {
    /// Link destination. Destinations rejected by `is_safe_url` are stored
    /// with a leading `'\0'` sentinel so the end-tag handler can tell them
    /// apart and emit only the link text.
    dest_url: String,
    /// Link text accumulated from the text events inside the link.
    text: String,
}
182
/// An image whose start tag has been seen but whose end tag has not.
#[derive(Debug, Clone)]
struct PendingImage {
    /// Image destination exactly as written in the markdown.
    dest_url: String,
    /// Alt text accumulated from the text events inside the image.
    alt: String,
}
188
189#[derive(Debug, Clone)]
190struct ParagraphBuilder {
191    style: MdStyleRef,
192    runs: Vec<Run>,
193    heading_level: Option<u8>,
194}
195
196impl ParagraphBuilder {
197    fn new(style: MdStyleRef) -> Self {
198        Self { style, runs: Vec::new(), heading_level: None }
199    }
200
201    fn push_text(&mut self, text: &str) {
202        if text.is_empty() {
203            return;
204        }
205
206        if let Some(last) = self.runs.last_mut() {
207            if let RunContent::Text(existing) = &mut last.content {
208                if last.char_shape_id == self.style.char_shape_id {
209                    existing.push_str(text);
210                    return;
211                }
212            }
213        }
214
215        self.runs.push(Run::text(text, self.style.char_shape_id));
216    }
217
218    fn push_run(&mut self, run: Run) {
219        self.runs.push(run);
220    }
221
222    fn build(mut self) -> Paragraph {
223        if self.runs.is_empty() {
224            self.runs.push(Run::text("", self.style.char_shape_id));
225        }
226        let mut para = Paragraph::with_runs(self.runs, self.style.para_shape_id);
227        para.heading_level = self.heading_level;
228        if let Some(level) = self.heading_level {
229            if (1..=7).contains(&level) {
230                // 개요 N is at style index N+1 (바탕글=0, 본문=1, 개요1=2, ...)
231                para.style_id = Some(StyleIndex::new((level as usize) + 1));
232            }
233        }
234        para
235    }
236}
237
238#[derive(Debug, Clone)]
239struct TableBuilder {
240    rows: Vec<Vec<Vec<Run>>>,
241    current_row: Vec<Vec<Run>>,
242    current_cell: Vec<Run>,
243    row_open: bool,
244    cell_open: bool,
245}
246
247impl TableBuilder {
248    fn new() -> Self {
249        Self {
250            rows: Vec::new(),
251            current_row: Vec::new(),
252            current_cell: Vec::new(),
253            row_open: false,
254            cell_open: false,
255        }
256    }
257
258    fn start_row(&mut self) {
259        self.end_cell();
260        self.end_row();
261        self.current_row.clear();
262        self.row_open = true;
263    }
264
265    fn end_row(&mut self) {
266        self.end_cell();
267        if self.row_open {
268            self.rows.push(std::mem::take(&mut self.current_row));
269            self.row_open = false;
270        }
271    }
272
273    fn start_cell(&mut self) {
274        self.end_cell();
275        self.current_cell = Vec::new();
276        self.cell_open = true;
277    }
278
279    fn end_cell(&mut self) {
280        if self.cell_open {
281            self.current_row.push(std::mem::take(&mut self.current_cell));
282            self.cell_open = false;
283        }
284    }
285
286    fn push_text_with_style(&mut self, text: &str, char_shape_id: CharShapeIndex) {
287        if !self.cell_open || text.is_empty() {
288            return;
289        }
290
291        if let Some(last) = self.current_cell.last_mut() {
292            if let RunContent::Text(existing) = &mut last.content {
293                if last.char_shape_id == char_shape_id {
294                    existing.push_str(text);
295                    return;
296                }
297            }
298        }
299
300        self.current_cell.push(Run::text(text, char_shape_id));
301    }
302
303    fn push_run(&mut self, run: Run) {
304        if self.cell_open {
305            self.current_cell.push(run);
306        }
307    }
308
309    fn is_in_cell(&self) -> bool {
310        self.cell_open
311    }
312
313    fn into_table(
314        mut self,
315        body_style: MdStyleRef,
316        page: hwpforge_core::PageSettings,
317    ) -> MdResult<Table> {
318        self.end_row();
319
320        if self.rows.is_empty() {
321            self.rows.push(vec![vec![Run::text("", body_style.char_shape_id)]]);
322        }
323
324        let max_cols = self.rows.iter().map(Vec::len).max().unwrap_or(1).max(1);
325        if max_cols > 10_000 {
326            return Err(MdError::UnsupportedStructure {
327                detail: format!("table has too many columns: {max_cols}"),
328            });
329        }
330        let divisor = i32::try_from(max_cols).unwrap_or(1);
331        let mut cell_width = page.printable_width() / divisor;
332        if cell_width.as_i32() <= 0 {
333            cell_width = HwpUnit::from_mm(40.0)?;
334        }
335
336        let table_rows = self
337            .rows
338            .into_iter()
339            .map(|mut row| {
340                if row.is_empty() {
341                    row.push(vec![Run::text("", body_style.char_shape_id)]);
342                }
343                while row.len() < max_cols {
344                    row.push(vec![Run::text("", body_style.char_shape_id)]);
345                }
346
347                let cells = row
348                    .into_iter()
349                    .map(|runs| {
350                        let runs = if runs.is_empty() {
351                            vec![Run::text("", body_style.char_shape_id)]
352                        } else {
353                            runs
354                        };
355                        let paragraph = Paragraph::with_runs(runs, body_style.para_shape_id);
356                        TableCell::new(vec![paragraph], cell_width)
357                    })
358                    .collect();
359
360                TableRow { cells, height: None }
361            })
362            .collect();
363
364        Ok(Table::new(table_rows))
365    }
366}
367
/// Mutable state threaded through the markdown event stream while decoding.
#[derive(Debug)]
struct DecoderState<'a> {
    /// Style mapping used to pick paragraph/character shapes and page settings.
    mapping: &'a MdMapping,
    /// Paragraphs completed so far, in document order.
    paragraphs: Vec<Paragraph>,
    /// Paragraph currently being built, if any.
    current: Option<ParagraphBuilder>,
    /// Table currently being built; `Some` between the table start and end tags.
    table: Option<TableBuilder>,
    /// Number of block quotes currently open.
    blockquote_depth: usize,
    /// `true` between a code block's start and end tags.
    in_code_block: bool,
    /// `true` while at least one list item is open.
    in_item: bool,
    /// One entry per open list item: `Some(prefix)` until the prefix has been
    /// written into a paragraph, `None` afterwards.
    pending_item_prefixes: Vec<Option<String>>,
    /// Numbering state of the open lists, innermost last.
    list_stack: Vec<ListState>,
    /// Link whose text is currently being collected.
    pending_link: Option<PendingLink>,
    /// Image whose alt text is currently being collected.
    pending_image: Option<PendingImage>,
    /// Paragraph indices at which a new section starts.
    section_breaks: Vec<usize>,
}
383
/// Final product of a `DecoderState`, returned by `DecoderState::finish`.
#[derive(Debug)]
struct DecodeOutput {
    /// All decoded paragraphs in document order.
    paragraphs: Vec<Paragraph>,
    /// Paragraph indices at which the paragraph list is split into sections.
    section_breaks: Vec<usize>,
}
389
390impl<'a> DecoderState<'a> {
391    fn new(mapping: &'a MdMapping) -> Self {
392        Self {
393            mapping,
394            paragraphs: Vec::new(),
395            current: None,
396            table: None,
397            blockquote_depth: 0,
398            in_code_block: false,
399            in_item: false,
400            pending_item_prefixes: Vec::new(),
401            list_stack: Vec::new(),
402            pending_link: None,
403            pending_image: None,
404            section_breaks: Vec::new(),
405        }
406    }
407
408    fn decode_markdown(&mut self, content: &str) -> MdResult<()> {
409        let mut options = Options::empty();
410        options.insert(Options::ENABLE_TABLES);
411        options.insert(Options::ENABLE_STRIKETHROUGH);
412        options.insert(Options::ENABLE_TASKLISTS);
413        options.insert(Options::ENABLE_FOOTNOTES);
414        options.insert(Options::ENABLE_DEFINITION_LIST);
415        options.insert(Options::ENABLE_GFM);
416
417        let parser = Parser::new_ext(content, options);
418        for event in parser {
419            self.handle_event(event)?;
420        }
421        Ok(())
422    }
423
424    fn finish(mut self) -> MdResult<DecodeOutput> {
425        if self.table.is_some() {
426            return Err(MdError::UnsupportedStructure {
427                detail: "table was not properly closed".to_string(),
428            });
429        }
430
431        self.finalize_paragraph();
432        if self.paragraphs.is_empty() {
433            self.paragraphs.push(ParagraphBuilder::new(self.mapping.body).build());
434        }
435        Ok(DecodeOutput { paragraphs: self.paragraphs, section_breaks: self.section_breaks })
436    }
437
438    fn handle_event(&mut self, event: Event<'_>) -> MdResult<()> {
439        match event {
440            Event::Start(tag) => self.start_tag(tag)?,
441            Event::End(tag_end) => self.end_tag(tag_end)?,
442            Event::Text(text) => self.push_text(text.as_ref())?,
443            Event::Code(code) => self.push_inline_code(code.as_ref())?,
444            Event::InlineMath(math) | Event::DisplayMath(math) => self.push_text(math.as_ref())?,
445            Event::Html(html) | Event::InlineHtml(html) => {
446                let raw = html.as_ref().trim();
447                if raw == SECTION_MARKER_COMMENT && !self.is_in_table_cell() {
448                    self.push_section_marker();
449                } else {
450                    return Err(unsupported_markdown_feature("raw HTML"));
451                }
452            }
453            Event::FootnoteReference(label) => {
454                return Err(unsupported_markdown_feature(&format!(
455                    "footnote reference '[^{}]'",
456                    label.as_ref()
457                )));
458            }
459            Event::SoftBreak => self.push_soft_break()?,
460            Event::HardBreak => self.push_hard_break()?,
461            Event::Rule => self.push_rule(),
462            Event::TaskListMarker(checked) => {
463                self.push_text(if checked { "[x] " } else { "[ ] " })?;
464            }
465        }
466
467        Ok(())
468    }
469
470    fn start_tag(&mut self, tag: Tag<'_>) -> MdResult<()> {
471        match tag {
472            Tag::Paragraph => {
473                self.ensure_paragraph();
474            }
475            Tag::Heading { level, .. } => {
476                let lvl = level_to_u32(level);
477                self.start_paragraph(self.mapping.heading(lvl));
478                if let Some(current) = self.current.as_mut() {
479                    current.heading_level = Some(lvl as u8);
480                }
481            }
482            Tag::BlockQuote(_) => {
483                self.blockquote_depth += 1;
484            }
485            Tag::CodeBlock(_) => {
486                self.in_code_block = true;
487                self.start_paragraph(self.mapping.code);
488            }
489            Tag::List(start) => {
490                self.list_stack.push(ListState::new(start));
491            }
492            Tag::Item => {
493                self.finalize_paragraph();
494                self.in_item = true;
495                let prefix = self.next_item_prefix();
496                self.pending_item_prefixes.push(Some(prefix));
497            }
498            Tag::Table(_) => {
499                self.materialize_pending_item_prefix_if_needed();
500                self.finalize_paragraph();
501                self.table = Some(TableBuilder::new());
502            }
503            Tag::TableHead => {}
504            Tag::TableRow => {
505                if let Some(table) = self.table.as_mut() {
506                    table.start_row();
507                }
508            }
509            Tag::TableCell => {
510                if let Some(table) = self.table.as_mut() {
511                    table.start_cell();
512                }
513            }
514            Tag::Link { dest_url, .. } => {
515                if !self.is_in_table_cell() {
516                    self.ensure_paragraph();
517                }
518                // Only create a hyperlink for safe URL schemes. Unsafe URLs
519                // (javascript:, data:, file:, etc.) are emitted as plain text
520                // by leaving pending_link as None and letting the text pass through.
521                if is_safe_url(&dest_url) {
522                    self.pending_link =
523                        Some(PendingLink { dest_url: dest_url.to_string(), text: String::new() });
524                } else {
525                    // Store the URL in pending_link with an empty sentinel so
526                    // we can still collect the link text, but mark it unsafe
527                    // by prefixing with '\0' (never a valid URL character).
528                    self.pending_link = Some(PendingLink {
529                        dest_url: format!("\x00{}", dest_url),
530                        text: String::new(),
531                    });
532                }
533            }
534            Tag::Image { dest_url, .. } => {
535                if !self.is_in_table_cell() {
536                    self.ensure_paragraph();
537                }
538                self.pending_image =
539                    Some(PendingImage { dest_url: dest_url.to_string(), alt: String::new() });
540            }
541            Tag::Emphasis
542            | Tag::Strong
543            | Tag::Strikethrough
544            | Tag::HtmlBlock
545            | Tag::Superscript
546            | Tag::Subscript => {}
547            Tag::FootnoteDefinition(_) => {
548                return Err(unsupported_markdown_feature("footnote definition"));
549            }
550            Tag::DefinitionList => {
551                return Err(unsupported_markdown_feature("definition list"));
552            }
553            Tag::DefinitionListTitle => {
554                return Err(unsupported_markdown_feature("definition list title"));
555            }
556            Tag::DefinitionListDefinition => {
557                return Err(unsupported_markdown_feature("definition list definition"));
558            }
559            Tag::MetadataBlock(_) => {
560                return Err(unsupported_markdown_feature("metadata block"));
561            }
562        }
563        Ok(())
564    }
565
566    fn end_tag(&mut self, tag_end: TagEnd) -> MdResult<()> {
567        match tag_end {
568            TagEnd::Paragraph => self.finalize_paragraph(),
569            TagEnd::Heading(_) => self.finalize_paragraph(),
570            TagEnd::BlockQuote(_) => {
571                self.blockquote_depth = self.blockquote_depth.saturating_sub(1);
572            }
573            TagEnd::CodeBlock => {
574                self.in_code_block = false;
575                self.finalize_paragraph();
576            }
577            TagEnd::List(_) => {
578                self.list_stack.pop();
579            }
580            TagEnd::Item => {
581                self.finalize_paragraph();
582                if let Some(Some(prefix)) = self.pending_item_prefixes.pop() {
583                    let mut paragraph = ParagraphBuilder::new(self.mapping.list_item);
584                    paragraph.push_text(prefix.trim_end());
585                    self.paragraphs.push(paragraph.build());
586                }
587                self.in_item = !self.pending_item_prefixes.is_empty();
588            }
589            TagEnd::Table => self.finalize_table()?,
590            TagEnd::TableHead => {}
591            TagEnd::TableRow => {
592                if let Some(table) = self.table.as_mut() {
593                    table.end_row();
594                }
595            }
596            TagEnd::TableCell => {
597                if let Some(table) = self.table.as_mut() {
598                    table.end_cell();
599                }
600            }
601            TagEnd::Link => {
602                if let Some(link) = self.pending_link.take() {
603                    let char_shape_id = self.current_char_shape_id();
604                    if link.dest_url.starts_with('\x00') {
605                        // Unsafe URL: emit the link text as plain text only.
606                        if !link.text.is_empty() {
607                            self.push_run_to_active_context(Run::text(link.text, char_shape_id));
608                        }
609                    } else {
610                        self.push_run_to_active_context(Run::control(
611                            Control::Hyperlink { text: link.text, url: link.dest_url },
612                            char_shape_id,
613                        ));
614                    }
615                }
616            }
617            TagEnd::Image => {
618                if let Some(image) = self.pending_image.take() {
619                    let format = image_format_from_path(&image.dest_url);
620                    let image = Image::new(
621                        image.dest_url,
622                        HwpUnit::from_mm(50.0)?,
623                        HwpUnit::from_mm(30.0)?,
624                        format,
625                    );
626                    let char_shape_id = self.current_char_shape_id();
627                    self.push_run_to_active_context(Run::image(image, char_shape_id));
628                }
629            }
630            TagEnd::Emphasis
631            | TagEnd::Strong
632            | TagEnd::Strikethrough
633            | TagEnd::HtmlBlock
634            | TagEnd::Superscript
635            | TagEnd::Subscript => {}
636            TagEnd::FootnoteDefinition => {
637                return Err(unsupported_markdown_feature("footnote definition"));
638            }
639            TagEnd::DefinitionList => {
640                return Err(unsupported_markdown_feature("definition list"));
641            }
642            TagEnd::DefinitionListTitle => {
643                return Err(unsupported_markdown_feature("definition list title"));
644            }
645            TagEnd::DefinitionListDefinition => {
646                return Err(unsupported_markdown_feature("definition list definition"));
647            }
648            TagEnd::MetadataBlock(_) => {
649                return Err(unsupported_markdown_feature("metadata block"));
650            }
651        }
652
653        Ok(())
654    }
655
656    fn push_text(&mut self, text: &str) -> MdResult<()> {
657        if let Some(image) = self.pending_image.as_mut() {
658            image.alt.push_str(text);
659            if let Some(link) = self.pending_link.as_mut() {
660                link.text.push_str(text);
661            }
662            return Ok(());
663        }
664
665        if let Some(link) = self.pending_link.as_mut() {
666            link.text.push_str(text);
667            return Ok(());
668        }
669
670        let char_shape_id = self.current_char_shape_id();
671        if let Some(table) = self.table.as_mut() {
672            if table.is_in_cell() {
673                table.push_text_with_style(text, char_shape_id);
674                return Ok(());
675            }
676        }
677
678        self.ensure_paragraph();
679        if let Some(current) = self.current.as_mut() {
680            current.push_text(text);
681        }
682
683        Ok(())
684    }
685
686    fn push_inline_code(&mut self, code: &str) -> MdResult<()> {
687        let char_shape_id = self.current_char_shape_id();
688        if let Some(table) = self.table.as_mut() {
689            if table.is_in_cell() {
690                table.push_text_with_style("`", char_shape_id);
691                table.push_text_with_style(code, char_shape_id);
692                table.push_text_with_style("`", char_shape_id);
693                return Ok(());
694            }
695        }
696
697        if self.in_code_block {
698            return self.push_text(code);
699        }
700
701        if let Some(link) = self.pending_link.as_mut() {
702            link.text.push_str(code);
703            return Ok(());
704        }
705
706        self.ensure_paragraph();
707        if let Some(current) = self.current.as_mut() {
708            current.push_text("`");
709            current.push_text(code);
710            current.push_text("`");
711        }
712        Ok(())
713    }
714
715    fn push_soft_break(&mut self) -> MdResult<()> {
716        if self.in_code_block {
717            self.push_text("\n")
718        } else {
719            self.push_text(" ")
720        }
721    }
722
    /// Handles a hard line break by emitting a newline through
    /// [`Self::push_text`], so it lands wherever text is currently collected.
    fn push_hard_break(&mut self) -> MdResult<()> {
        self.push_text("\n")
    }
726
727    fn push_rule(&mut self) {
728        self.finalize_paragraph();
729        let mut builder = ParagraphBuilder::new(self.mapping.body);
730        builder.push_text("---");
731        self.paragraphs.push(builder.build());
732    }
733
734    fn push_section_marker(&mut self) {
735        self.finalize_paragraph();
736        let split_at = self.paragraphs.len();
737        if split_at > 0 && self.section_breaks.last().copied() != Some(split_at) {
738            self.section_breaks.push(split_at);
739        }
740    }
741
742    fn finalize_table(&mut self) -> MdResult<()> {
743        let table_builder = self.table.take().ok_or_else(|| MdError::UnsupportedStructure {
744            detail: "table end tag without table start".to_string(),
745        })?;
746
747        let table = table_builder.into_table(self.mapping.body, self.mapping.page_settings)?;
748        let paragraph = Paragraph::with_runs(
749            vec![Run::table(table, self.mapping.body.char_shape_id)],
750            self.mapping.body.para_shape_id,
751        );
752        self.paragraphs.push(paragraph);
753        Ok(())
754    }
755
756    fn style_for_context(&self) -> MdStyleRef {
757        if self.in_code_block {
758            return self.mapping.code;
759        }
760        if self.in_item {
761            return self.mapping.list_item;
762        }
763        if self.blockquote_depth > 0 {
764            return self.mapping.blockquote;
765        }
766        self.mapping.body
767    }
768
769    fn current_char_shape_id(&self) -> CharShapeIndex {
770        self.current
771            .as_ref()
772            .map(|p| p.style.char_shape_id)
773            .unwrap_or(self.style_for_context().char_shape_id)
774    }
775
776    fn is_in_table_cell(&self) -> bool {
777        self.table.as_ref().map(TableBuilder::is_in_cell).unwrap_or(false)
778    }
779
780    fn push_run_to_active_context(&mut self, run: Run) {
781        if self.is_in_table_cell() {
782            if let Some(table) = self.table.as_mut() {
783                table.push_run(run);
784            }
785            return;
786        }
787
788        self.ensure_paragraph();
789        if let Some(current) = self.current.as_mut() {
790            current.push_run(run);
791        }
792    }
793
794    fn take_pending_item_prefix(&mut self) -> Option<String> {
795        self.pending_item_prefixes.last_mut().and_then(Option::take)
796    }
797
798    fn materialize_pending_item_prefix_if_needed(&mut self) {
799        if self.current.is_some() {
800            return;
801        }
802
803        if let Some(prefix) = self.take_pending_item_prefix() {
804            let mut paragraph = ParagraphBuilder::new(self.mapping.list_item);
805            paragraph.push_text(&prefix);
806            self.paragraphs.push(paragraph.build());
807        }
808    }
809
810    fn ensure_paragraph(&mut self) {
811        if self.current.is_none() {
812            let mut paragraph = ParagraphBuilder::new(self.style_for_context());
813            if let Some(prefix) = self.take_pending_item_prefix() {
814                paragraph.push_text(&prefix);
815            }
816            self.current = Some(paragraph);
817        }
818    }
819
820    fn start_paragraph(&mut self, style: MdStyleRef) {
821        self.finalize_paragraph();
822        let mut paragraph = ParagraphBuilder::new(style);
823        if let Some(prefix) = self.take_pending_item_prefix() {
824            paragraph.push_text(&prefix);
825        }
826        self.current = Some(paragraph);
827    }
828
829    fn finalize_paragraph(&mut self) {
830        if let Some(link) = self.pending_link.take() {
831            self.ensure_paragraph();
832            if let Some(current) = self.current.as_mut() {
833                current.push_text(&format!("[{}]({})", link.text, link.dest_url));
834            }
835        }
836
837        if let Some(image) = self.pending_image.take() {
838            self.ensure_paragraph();
839            if let Some(current) = self.current.as_mut() {
840                current.push_text(&format!("![{}]({})", image.alt, image.dest_url));
841            }
842        }
843
844        if let Some(current) = self.current.take() {
845            self.paragraphs.push(current.build());
846        }
847    }
848
849    fn next_item_prefix(&mut self) -> String {
850        if let Some(last) = self.list_stack.last_mut() {
851            if last.ordered {
852                let prefix = format!("{}. ", last.next_index);
853                last.next_index += 1;
854                return prefix;
855            }
856            return "- ".to_string();
857        }
858        "- ".to_string()
859    }
860}
861
862fn level_to_u32(level: HeadingLevel) -> u32 {
863    match level {
864        HeadingLevel::H1 => 1,
865        HeadingLevel::H2 => 2,
866        HeadingLevel::H3 => 3,
867        HeadingLevel::H4 => 4,
868        HeadingLevel::H5 => 5,
869        HeadingLevel::H6 => 6,
870    }
871}
872
873fn empty_paragraph(style: MdStyleRef) -> Paragraph {
874    Paragraph::with_runs(vec![Run::text("", style.char_shape_id)], style.para_shape_id)
875}
876
877fn default_empty_section() -> Section {
878    let paragraph =
879        Paragraph::with_runs(vec![Run::text("", CharShapeIndex::new(0))], ParaShapeIndex::new(0));
880    Section::with_paragraphs(vec![paragraph], hwpforge_core::PageSettings::a4())
881}
882
883fn unsupported_markdown_feature(feature: &str) -> MdError {
884    MdError::UnsupportedStructure { detail: format!("unsupported markdown feature: {feature}") }
885}
886
887/// Reads a file after checking that its size does not exceed [`MAX_MD_FILE_SIZE`].
888fn read_checked(path: &Path) -> MdResult<String> {
889    let metadata = std::fs::metadata(path)?;
890    let size = metadata.len();
891    if size > MAX_MD_FILE_SIZE {
892        return Err(MdError::FileTooLarge { size, limit: MAX_MD_FILE_SIZE });
893    }
894    Ok(std::fs::read_to_string(path)?)
895}
896
897fn split_sections(paragraphs: Vec<Paragraph>, section_breaks: &[usize]) -> Vec<Vec<Paragraph>> {
898    if paragraphs.is_empty() {
899        return Vec::new();
900    }
901
902    if section_breaks.is_empty() {
903        return vec![paragraphs];
904    }
905
906    let mut sections = Vec::new();
907    let mut start = 0usize;
908
909    for &break_idx in section_breaks {
910        if break_idx > start && break_idx <= paragraphs.len() {
911            sections.push(paragraphs[start..break_idx].to_vec());
912            start = break_idx;
913        }
914    }
915
916    if start < paragraphs.len() {
917        sections.push(paragraphs[start..].to_vec());
918    }
919
920    sections.into_iter().filter(|section| !section.is_empty()).collect()
921}
922
#[cfg(test)]
mod tests {
    use super::*;
    use crate::MdEncoder;
    use hwpforge_blueprint::builtins::builtin_default;
    use hwpforge_core::PageSettings;

    /// Built-in default template used as the decoding target throughout these tests.
    fn default_template() -> Template {
        builtin_default().unwrap()
    }

    /// Text content of every paragraph in `paragraphs`, in order.
    fn paragraph_texts(paragraphs: &[Paragraph]) -> Vec<String> {
        paragraphs.iter().map(Paragraph::text_content).collect()
    }

    /// True when any run in `runs` is a hyperlink control, whatever its text or URL.
    fn has_any_hyperlink(runs: &[Run]) -> bool {
        runs.iter().any(|run| {
            matches!(
                run.content,
                RunContent::Control(ref ctrl) if matches!(ctrl.as_ref(), Control::Hyperlink { .. })
            )
        })
    }

    /// True when any run in `runs` is a hyperlink control pointing at `expected_url`.
    fn has_hyperlink_to(runs: &[Run], expected_url: &str) -> bool {
        runs.iter().any(|run| {
            matches!(
                run.content,
                RunContent::Control(ref ctrl)
                    if matches!(
                        ctrl.as_ref(),
                        Control::Hyperlink { url, .. } if url == expected_url
                    )
            )
        })
    }

    /// True when any run in `runs` is a hyperlink control with exactly this text and URL.
    fn has_hyperlink(runs: &[Run], expected_text: &str, expected_url: &str) -> bool {
        runs.iter().any(|run| {
            matches!(
                run.content,
                RunContent::Control(ref ctrl)
                    if matches!(
                        ctrl.as_ref(),
                        Control::Hyperlink { text, url }
                            if text == expected_text && url == expected_url
                    )
            )
        })
    }

    /// True when any run in `runs` is an image whose path equals `expected_path`.
    fn has_image(runs: &[Run], expected_path: &str) -> bool {
        runs.iter().any(|run| {
            matches!(
                run.content,
                RunContent::Image(ref img) if img.path == expected_path
            )
        })
    }

    /// True when any run in `runs` is a plain text run equal to `expected`.
    fn has_text_run(runs: &[Run], expected: &str) -> bool {
        runs.iter().any(|run| {
            matches!(
                &run.content,
                RunContent::Text(t) if t == expected
            )
        })
    }

    /// Decodes `markdown` against the default template and asserts that it fails with
    /// `MdError::UnsupportedStructure` whose detail mentions `expected_detail`.
    fn assert_unsupported(markdown: &str, expected_detail: &str) {
        let template = default_template();
        let err = MdDecoder::decode(markdown, &template).unwrap_err();

        assert!(matches!(
            err,
            MdError::UnsupportedStructure { ref detail }
                if detail.contains(expected_detail)
        ));
    }

    #[test]
    fn decode_heading_and_body() {
        let template = default_template();
        let (mapping, _) = resolve_mapping(&template).unwrap();
        let decoded = MdDecoder::decode("# Hello\n\nBody text", &template).unwrap();
        let sections = decoded.document.sections();

        assert_eq!(sections.len(), 1);
        let paragraphs = &sections[0].paragraphs;
        assert_eq!(paragraphs.len(), 2);
        assert_eq!(paragraphs[0].para_shape_id, mapping.heading1.para_shape_id);
        assert_eq!(paragraphs[1].para_shape_id, mapping.body.para_shape_id);
        assert_eq!(paragraphs[0].text_content(), "Hello");
    }

    #[test]
    fn decode_returns_style_registry() {
        let template = default_template();
        let result = MdDecoder::decode("body text", &template).unwrap();
        let registry = &result.style_registry;

        assert!(registry.font_count() > 0);
        assert!(registry.char_shape_count() > 0);
        assert!(registry.para_shape_count() > 0);
    }

    #[test]
    fn decode_frontmatter_into_metadata() {
        let template = default_template();
        let markdown = "---\ntitle: My Proposal\nauthor: Kim\ndate: 2026-02-16\n---\n\nBody";
        let result = MdDecoder::decode(markdown, &template).unwrap();
        let metadata = result.document.metadata();

        assert_eq!(metadata.title.as_deref(), Some("My Proposal"));
        assert_eq!(metadata.author.as_deref(), Some("Kim"));
        assert_eq!(metadata.created.as_deref(), Some("2026-02-16"));
    }

    #[test]
    fn decode_table_into_table_run() {
        let template = default_template();
        let doc =
            MdDecoder::decode("| A | B |\n|---|---|\n| 1 | 2 |", &template).unwrap().document;

        let section = &doc.sections()[0];
        let table = section
            .paragraphs
            .iter()
            .flat_map(|paragraph| paragraph.runs.iter())
            .find_map(|run| run.content.as_table())
            .expect("table run");

        assert!(table.row_count() >= 1);
        assert_eq!(table.col_count(), 2);
    }

    #[test]
    fn decode_link_and_image() {
        let template = default_template();
        let markdown = "[Rust](https://www.rust-lang.org) ![logo](logo.png)";
        let doc = MdDecoder::decode(markdown, &template).unwrap().document;
        let paragraph = &doc.sections()[0].paragraphs[0];

        assert!(has_hyperlink_to(&paragraph.runs, "https://www.rust-lang.org"));
        assert!(has_image(&paragraph.runs, "logo.png"));
    }

    #[test]
    fn unsafe_url_emitted_as_plain_text() {
        let template = default_template();
        // A `javascript:` destination must never become a hyperlink control.
        let markdown = "[click me](javascript:alert(1))";
        let doc = MdDecoder::decode(markdown, &template).unwrap().document;
        let paragraph = &doc.sections()[0].paragraphs[0];

        assert!(!has_any_hyperlink(&paragraph.runs));
        // The visible link text survives as an ordinary text run.
        assert!(has_text_run(&paragraph.runs, "click me"));
    }

    #[test]
    fn unsafe_data_url_emitted_as_plain_text() {
        let template = default_template();
        let markdown = "[xss](data:text/html,<script>alert(1)</script>)";
        let doc = MdDecoder::decode(markdown, &template).unwrap().document;
        let paragraph = &doc.sections()[0].paragraphs[0];

        assert!(!has_any_hyperlink(&paragraph.runs));
    }

    #[test]
    fn unsafe_file_url_emitted_as_plain_text() {
        let template = default_template();
        let markdown = "[secret](file:///etc/passwd)";
        let doc = MdDecoder::decode(markdown, &template).unwrap().document;
        let paragraph = &doc.sections()[0].paragraphs[0];

        // No hyperlink control, but the link text is kept as plain text.
        assert!(!has_any_hyperlink(&paragraph.runs));
        assert!(has_text_run(&paragraph.runs, "secret"));
    }

    #[test]
    fn decode_linked_image_keeps_hyperlink_text() {
        let template = default_template();
        let markdown = "[![logo](logo.png)](https://example.com)";
        let doc = MdDecoder::decode(markdown, &template).unwrap().document;
        let paragraph = &doc.sections()[0].paragraphs[0];

        assert!(has_image(&paragraph.runs, "logo.png"));
        assert!(has_hyperlink(&paragraph.runs, "logo", "https://example.com"));
    }

    #[test]
    fn decode_empty_markdown_creates_placeholder_paragraph() {
        let template = default_template();
        let doc = MdDecoder::decode("", &template).unwrap().document;
        let sections = doc.sections();

        assert_eq!(sections.len(), 1);
        assert_eq!(sections[0].paragraphs.len(), 1);
        assert_eq!(sections[0].paragraphs[0].runs.len(), 1);
    }

    #[test]
    fn decode_ordered_list_prefix_increments() {
        let template = default_template();
        let doc = MdDecoder::decode("1. alpha\n2. beta", &template).unwrap().document;
        let texts = paragraph_texts(&doc.sections()[0].paragraphs);

        assert_eq!(texts, vec!["1. alpha", "2. beta"]);
    }

    #[test]
    fn decode_section_marker_comment_splits_sections() {
        let template = default_template();
        let markdown = "First\n\n<!-- hwpforge:section -->\n\nSecond";
        let doc = MdDecoder::decode(markdown, &template).unwrap().document;
        let sections = doc.sections();

        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].paragraphs[0].text_content(), "First");
        assert_eq!(sections[1].paragraphs[0].text_content(), "Second");
    }

    #[test]
    fn decode_table_cell_link_preserves_control_run() {
        let template = default_template();
        let markdown = "| Link |\n|---|\n| [Rust](https://www.rust-lang.org) |";
        let doc = MdDecoder::decode(markdown, &template).unwrap().document;

        let section = &doc.sections()[0];
        let table = section
            .paragraphs
            .iter()
            .flat_map(|paragraph| paragraph.runs.iter())
            .find_map(|run| run.content.as_table())
            .expect("table run");

        // The hyperlink lives inside the table cell...
        let cell_paragraph = &table.rows[0].cells[0].paragraphs[0];
        assert!(has_hyperlink(&cell_paragraph.runs, "Rust", "https://www.rust-lang.org"));

        // ...and is not duplicated as a control run at the top level.
        let top_level_controls = section
            .paragraphs
            .iter()
            .flat_map(|paragraph| paragraph.runs.iter())
            .filter(|run| matches!(run.content, RunContent::Control(_)))
            .count();
        assert_eq!(top_level_controls, 0);
    }

    #[test]
    fn decode_table_cell_image_preserves_image_run() {
        let template = default_template();
        let markdown = "| Img |\n|---|\n| ![logo](logo.png) |";
        let doc = MdDecoder::decode(markdown, &template).unwrap().document;

        let table = doc.sections()[0].paragraphs[0].runs[0].content.as_table().unwrap();
        let cell_runs = &table.rows[0].cells[0].paragraphs[0].runs;
        assert!(has_image(cell_runs, "logo.png"));
    }

    #[test]
    fn decode_footnote_reference_returns_unsupported_structure_error() {
        assert_unsupported("Body[^1]\n\n[^1]: note", "footnote reference");
    }

    #[test]
    fn decode_definition_list_returns_unsupported_structure_error() {
        assert_unsupported("Term\n: Definition", "definition list");
    }

    #[test]
    fn decode_raw_html_returns_unsupported_structure_error() {
        assert_unsupported("<div>raw</div>", "raw HTML");
    }

    #[test]
    fn decode_lossless_reconstructs_core_structure() {
        let mut draft = Document::new();
        draft.metadata_mut().title = Some("Lossless".to_string());

        let link = Control::Hyperlink {
            text: "Rust".to_string(),
            url: "https://www.rust-lang.org".to_string(),
        };
        let runs = vec![
            Run::text("A", CharShapeIndex::new(3)),
            Run::control(link, CharShapeIndex::new(4)),
        ];
        let paragraph = Paragraph::with_runs(runs, ParaShapeIndex::new(2));
        draft.add_section(Section::with_paragraphs(vec![paragraph], PageSettings::a4()));

        // Round-trip through the lossless encoder and decoder.
        let validated = draft.validate().unwrap();
        let markdown = MdEncoder::encode_lossless(&validated).unwrap();
        let decoded = MdDecoder::decode_lossless(&markdown).unwrap();

        assert_eq!(decoded.metadata().title.as_deref(), Some("Lossless"));
        assert_eq!(decoded.sections().len(), 1);
        assert_eq!(decoded.sections()[0].paragraphs[0].para_shape_id.get(), 2);
        assert!(has_hyperlink(
            &decoded.sections()[0].paragraphs[0].runs,
            "Rust",
            "https://www.rust-lang.org"
        ));
    }

    #[test]
    fn decode_nested_list_keeps_outer_prefix_progression() {
        let template = default_template();
        let doc = MdDecoder::decode("1.\n   - child\n2. next", &template).unwrap().document;
        let texts = paragraph_texts(&doc.sections()[0].paragraphs);

        assert!(texts.iter().any(|text| text.starts_with("1.")));
        assert!(texts.iter().any(|text| text.starts_with("2. ")));
    }

    #[test]
    fn decode_lossless_preserves_exact_hwpunit_geometry() {
        let mut page = PageSettings::a4();
        page.width = HwpUnit::new(59_529).unwrap();
        page.height = HwpUnit::new(84_190).unwrap();
        page.margin_left = HwpUnit::new(5_671).unwrap();

        let paragraph = Paragraph::with_runs(
            vec![Run::text("x", CharShapeIndex::new(0))],
            ParaShapeIndex::new(0),
        );
        let mut draft = Document::new();
        draft.add_section(Section::with_paragraphs(vec![paragraph], page));

        let validated = draft.validate().unwrap();
        let encoded = MdEncoder::encode_lossless(&validated).unwrap();
        let decoded = MdDecoder::decode_lossless(&encoded).unwrap();
        let restored = decoded.sections()[0].page_settings;

        assert_eq!(restored.width.as_i32(), 59_529);
        assert_eq!(restored.height.as_i32(), 84_190);
        assert_eq!(restored.margin_left.as_i32(), 5_671);
    }

    #[test]
    fn decode_with_default_uses_builtin_template() {
        let result = MdDecoder::decode_with_default("# 제목\n\n본문입니다.").unwrap();

        assert!(!result.document.sections().is_empty());
        assert!(result.style_registry.font_count() > 0);
    }

    #[test]
    fn decode_file_with_default_reads_and_decodes() {
        let fixture: std::path::PathBuf =
            [env!("CARGO_MANIFEST_DIR"), "tests", "fixtures", "simple_body.md"].iter().collect();
        let result = MdDecoder::decode_file_with_default(fixture).unwrap();

        assert_eq!(result.document.metadata().title.as_deref(), Some("Simple Body Test"));
    }

    #[test]
    fn h1_heading_sets_style_id_to_2() {
        use hwpforge_foundation::StyleIndex;

        let template = default_template();
        let result = MdDecoder::decode("# 제목", &template).unwrap();
        let heading = &result.document.sections()[0].paragraphs[0];

        assert_eq!(heading.style_id, Some(StyleIndex::new(2)));
    }

    #[test]
    fn all_heading_levels_map_to_style_id() {
        use hwpforge_foundation::StyleIndex;

        let template = default_template();
        for level in 1usize..=6 {
            // Heading level N is expected to carry style index N + 1.
            let expected = level + 1;
            let md = format!("{} 제목{level}", "#".repeat(level));
            let result = MdDecoder::decode(&md, &template).unwrap();
            let heading = &result.document.sections()[0].paragraphs[0];

            assert_eq!(
                heading.style_id,
                Some(StyleIndex::new(expected)),
                "H{level} should map to style_id {}",
                expected
            );
        }
    }

    #[test]
    fn body_paragraph_has_no_style_id() {
        let template = default_template();
        let result = MdDecoder::decode("본문입니다.", &template).unwrap();
        let body = &result.document.sections()[0].paragraphs[0];

        assert_eq!(body.style_id, None);
    }
}