hwpforge_smithy_hwpx/decoder/
mod.rs

1//! HWPX decoding pipeline.
2//!
3//! Submodules handle individual stages:
4//! - `package` — ZIP extraction and file access
5//! - `header` — `header.xml` parsing → [`HwpxStyleStore`]
6//! - `section` — `section*.xml` parsing → paragraphs + page settings
7
8pub(crate) mod chart;
9pub(crate) mod header;
10pub(crate) mod package;
11pub(crate) mod section;
12pub(crate) mod shapes;
13
14use std::path::Path;
15
16use hwpforge_core::document::{Document, Draft};
17use hwpforge_core::image::ImageStore;
18use hwpforge_core::section::Section;
19use hwpforge_core::PageSettings;
20
21use crate::error::HwpxResult;
22use crate::style_store::HwpxStyleStore;
23
24// ── HwpxDocument ─────────────────────────────────────────────────
25
26/// The result of decoding an HWPX file.
27///
28/// Contains the Core document (structure), the HWPX-specific style
29/// store (fonts, char shapes, para shapes from `header.xml`), and
30/// binary image data extracted from `BinData/` entries.
31#[derive(Debug)]
32#[non_exhaustive]
33pub struct HwpxDocument {
34    /// The decoded document in Core's DOM.
35    pub document: Document<Draft>,
36    /// Style information parsed from `header.xml`.
37    pub style_store: HwpxStyleStore,
38    /// Binary image data extracted from `BinData/` ZIP entries.
39    pub image_store: ImageStore,
40}
41
42// ── HwpxDecoder ──────────────────────────────────────────────────
43
/// Decodes HWPX files (ZIP + XML) into Core's `Document<Draft>`.
///
/// The type carries no state; its decoding entry points are associated
/// functions.
///
/// # Examples
///
/// ```no_run
/// use hwpforge_smithy_hwpx::HwpxDecoder;
///
/// let bytes = std::fs::read("document.hwpx").unwrap();
/// let result = HwpxDecoder::decode(&bytes).unwrap();
/// println!("Sections: {}", result.document.sections().len());
/// ```
#[derive(Debug)]
pub struct HwpxDecoder;
56
57impl HwpxDecoder {
58    /// Decodes an HWPX file from raw bytes.
59    ///
60    /// Pipeline:
61    /// 1. Open ZIP archive, validate mimetype
62    /// 2. Parse `Contents/header.xml` → `HwpxStyleStore`
63    /// 3. Parse `Contents/section*.xml` → paragraphs + page settings
64    /// 4. Assemble `Document<Draft>` with sections
65    pub fn decode(bytes: &[u8]) -> HwpxResult<HwpxDocument> {
66        // Step 1: Open package
67        let mut pkg = package::PackageReader::new(bytes)?;
68
69        // Step 2: Parse header
70        let header_xml = pkg.read_header_xml()?;
71        let style_store = header::parse_header(&header_xml)?;
72
73        // Step 3: Extract chart XMLs from ZIP
74        let chart_xmls = pkg.read_chart_xmls()?;
75
76        // Step 4: Parse sections
77        let mut document = Document::<Draft>::new();
78        let section_count = pkg.section_count();
79
80        for i in 0..section_count {
81            let section_xml = pkg.read_section_xml(i)?;
82            let result = section::parse_section(&section_xml, i, &chart_xmls)?;
83
84            let page_settings = result.page_settings.unwrap_or_else(PageSettings::a4);
85
86            let section = Section {
87                paragraphs: result.paragraphs,
88                page_settings,
89                header: result.header,
90                footer: result.footer,
91                page_number: result.page_number,
92                column_settings: result.column_settings,
93                visibility: result.visibility,
94                line_number_shape: result.line_number_shape,
95                page_border_fills: result.page_border_fills,
96                master_pages: result.master_pages,
97                begin_num: None,
98                text_direction: result.text_direction,
99            };
100
101            document.add_section(section);
102        }
103
104        // Step 5: Extract binary image data from BinData/
105        let image_store = pkg.read_all_bindata()?;
106
107        Ok(HwpxDocument { document, style_store, image_store })
108    }
109
110    /// Decodes an HWPX file from a filesystem path.
111    pub fn decode_file(path: impl AsRef<Path>) -> HwpxResult<HwpxDocument> {
112        let bytes = std::fs::read(path.as_ref()).map_err(crate::error::HwpxError::Io)?;
113        Self::decode(&bytes)
114    }
115}
116
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{Cursor, Write};
    use zip::write::SimpleFileOptions;
    use zip::ZipWriter;

    /// Builds an in-memory HWPX archive for tests.
    ///
    /// Entries are written in this order: a stored `mimetype`, then
    /// `Contents/header.xml`, then one `Contents/section{i}.xml` per element
    /// of `section_xmls`, and finally each `(archive path, bytes)` pair of
    /// `extra_entries` as a stored entry.
    fn make_test_hwpx_with_entries(
        header_xml: &str,
        section_xmls: &[&str],
        extra_entries: &[(&str, &[u8])],
    ) -> Vec<u8> {
        let mut writer = ZipWriter::new(Cursor::new(Vec::new()));

        let stored =
            SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
        let deflate = SimpleFileOptions::default();

        writer.start_file("mimetype", stored).unwrap();
        writer.write_all(b"application/hwp+zip").unwrap();

        writer.start_file("Contents/header.xml", deflate).unwrap();
        writer.write_all(header_xml.as_bytes()).unwrap();

        for (i, xml) in section_xmls.iter().enumerate() {
            let path = format!("Contents/section{}.xml", i);
            writer.start_file(&path, deflate).unwrap();
            writer.write_all(xml.as_bytes()).unwrap();
        }

        for &(path, data) in extra_entries {
            writer.start_file(path, stored).unwrap();
            writer.write_all(data).unwrap();
        }

        writer.finish().unwrap().into_inner()
    }

    /// Creates a complete minimal HWPX (mimetype, header, sections) for testing.
    fn make_test_hwpx(header_xml: &str, section_xmls: &[&str]) -> Vec<u8> {
        make_test_hwpx_with_entries(header_xml, section_xmls, &[])
    }

    /// Minimal `header.xml` fixture: one font, one char shape, one para shape.
    const HEADER: &str = r##"<head version="1.4" secCnt="1">
        <refList>
            <fontfaces itemCnt="1">
                <fontface lang="HANGUL" fontCnt="1">
                    <font id="0" face="함초롬돋움" type="TTF" isEmbedded="0"/>
                </fontface>
            </fontfaces>
            <charProperties itemCnt="1">
                <charPr id="0" height="1000" textColor="#000000" shadeColor="none"
                        useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0">
                    <fontRef hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
                </charPr>
            </charProperties>
            <paraProperties itemCnt="1">
                <paraPr id="0">
                    <align horizontal="LEFT" vertical="BASELINE"/>
                    <switch><default>
                        <lineSpacing type="PERCENT" value="160"/>
                    </default></switch>
                </paraPr>
            </paraProperties>
        </refList>
    </head>"##;

    /// Section fixture with explicit page settings and one text run.
    const SECTION_TEXT: &str = r#"<sec>
        <p paraPrIDRef="0">
            <run charPrIDRef="0">
                <secPr textDirection="HORIZONTAL">
                    <pagePr landscape="WIDELY" width="59528" height="84188">
                        <margin header="4252" footer="4252" gutter="0"
                                left="8504" right="8504" top="5668" bottom="4252"/>
                    </pagePr>
                </secPr>
                <t>안녕하세요</t>
            </run>
        </p>
    </sec>"#;

    // ── Full pipeline tests ──────────────────────────────────────

    /// A one-section archive decodes into text, page settings, and styles.
    #[test]
    fn decode_minimal_hwpx() {
        let bytes = make_test_hwpx(HEADER, &[SECTION_TEXT]);
        let result = HwpxDecoder::decode(&bytes).unwrap();

        // Document structure
        assert_eq!(result.document.sections().len(), 1);
        let section = &result.document.sections()[0];
        assert_eq!(section.paragraphs.len(), 1);

        // Text content
        let text = section.paragraphs[0].runs[0].content.as_text();
        assert_eq!(text, Some("안녕하세요"));

        // Page settings
        assert_eq!(section.page_settings.width.as_i32(), 59528);
        assert_eq!(section.page_settings.height.as_i32(), 84188);

        // Style store
        assert_eq!(result.style_store.font_count(), 1);
        assert_eq!(result.style_store.char_shape_count(), 1);
        assert_eq!(result.style_store.para_shape_count(), 1);
    }

    /// Every section file in the archive becomes one document section.
    #[test]
    fn decode_multiple_sections() {
        let s0 = r#"<sec><p paraPrIDRef="0"><run charPrIDRef="0"><t>Section 0</t></run></p></sec>"#;
        let s1 = r#"<sec><p paraPrIDRef="0"><run charPrIDRef="0"><t>Section 1</t></run></p></sec>"#;
        let bytes = make_test_hwpx(HEADER, &[s0, s1]);
        let result = HwpxDecoder::decode(&bytes).unwrap();
        assert_eq!(result.document.sections().len(), 2);
    }

    /// A `<tbl>` inside a run decodes to table content.
    #[test]
    fn decode_with_table() {
        let section = r#"<sec>
            <p paraPrIDRef="0">
                <run charPrIDRef="0">
                    <tbl rowCnt="1" colCnt="1">
                        <tr>
                            <tc name="A1">
                                <cellSz width="5000" height="1000"/>
                                <subList><p paraPrIDRef="0"><run charPrIDRef="0"><t>Cell</t></run></p></subList>
                            </tc>
                        </tr>
                    </tbl>
                </run>
            </p>
        </sec>"#;
        let bytes = make_test_hwpx(HEADER, &[section]);
        let result = HwpxDecoder::decode(&bytes).unwrap();
        let run = &result.document.sections()[0].paragraphs[0].runs[0];
        assert!(run.content.is_table());
    }

    /// Without a `<secPr>`, the section falls back to A4 page settings.
    #[test]
    fn decode_section_without_secpr_uses_a4_defaults() {
        let section = r#"<sec><p paraPrIDRef="0"><run charPrIDRef="0"><t>Text</t></run></p></sec>"#;
        let bytes = make_test_hwpx(HEADER, &[section]);
        let result = HwpxDecoder::decode(&bytes).unwrap();
        let ps = &result.document.sections()[0].page_settings;
        assert_eq!(*ps, PageSettings::a4());
    }

    /// Non-ZIP input surfaces as a ZIP error.
    #[test]
    fn decode_not_a_zip() {
        let err = HwpxDecoder::decode(b"not a zip").unwrap_err();
        assert!(matches!(err, crate::error::HwpxError::Zip(_)));
    }

    /// A missing file surfaces as an I/O error.
    #[test]
    fn decode_file_nonexistent() {
        let err = HwpxDecoder::decode_file("/nonexistent/path.hwpx").unwrap_err();
        assert!(matches!(err, crate::error::HwpxError::Io(_)));
    }

    // ── Header / Footer / PageNum decode tests ──────────────────

    /// A `<header>` control is lifted onto the section's `header` field.
    #[test]
    fn decode_section_with_header_ctrl() {
        let section = r#"<sec>
            <p paraPrIDRef="0">
                <run charPrIDRef="0">
                    <ctrl>
                        <header id="0" applyPageType="BOTH">
                            <subList id="0" textDirection="HORIZONTAL" lineWrap="BREAK" vertAlign="TOP"
                                     linkListIDRef="0" linkListNextIDRef="0" textWidth="0" textHeight="0">
                                <p paraPrIDRef="0">
                                    <run charPrIDRef="0"><t>Page Header</t></run>
                                </p>
                            </subList>
                        </header>
                    </ctrl>
                    <t>Body text</t>
                </run>
            </p>
        </sec>"#;
        let bytes = make_test_hwpx(HEADER, &[section]);
        let result = HwpxDecoder::decode(&bytes).unwrap();

        let sec = &result.document.sections()[0];
        let header = sec.header.as_ref().expect("section should have header");
        assert_eq!(header.apply_page_type, hwpforge_foundation::ApplyPageType::Both);
        assert_eq!(header.paragraphs.len(), 1);
        assert_eq!(header.paragraphs[0].runs[0].content.as_text(), Some("Page Header"));
    }

    /// `<footer>` and `<pageNum>` controls populate the section's footer and page number.
    #[test]
    fn decode_section_with_footer_and_pagenum() {
        let section = r#"<sec>
            <p paraPrIDRef="0">
                <run charPrIDRef="0">
                    <ctrl>
                        <footer id="0" applyPageType="ODD">
                            <subList id="0" textDirection="HORIZONTAL" lineWrap="BREAK" vertAlign="TOP"
                                     linkListIDRef="0" linkListNextIDRef="0" textWidth="0" textHeight="0">
                                <p paraPrIDRef="0">
                                    <run charPrIDRef="0"><t>Footer</t></run>
                                </p>
                            </subList>
                        </footer>
                    </ctrl>
                    <ctrl>
                        <pageNum pos="BOTTOM_CENTER" formatType="DIGIT" sideChar="- "/>
                    </ctrl>
                    <t>Body</t>
                </run>
            </p>
        </sec>"#;
        let bytes = make_test_hwpx(HEADER, &[section]);
        let result = HwpxDecoder::decode(&bytes).unwrap();

        let sec = &result.document.sections()[0];
        let footer = sec.footer.as_ref().expect("section should have footer");
        assert_eq!(footer.apply_page_type, hwpforge_foundation::ApplyPageType::Odd);
        assert_eq!(footer.paragraphs[0].runs[0].content.as_text(), Some("Footer"));

        let pn = sec.page_number.as_ref().expect("section should have page number");
        assert_eq!(pn.position, hwpforge_foundation::PageNumberPosition::BottomCenter);
        assert_eq!(pn.number_format, hwpforge_foundation::NumberFormatType::Digit);
        assert_eq!(pn.decoration, "- ");
    }

    // ── Image binary roundtrip test ─────────────────────────────

    /// A `BinData/` entry ends up in the image store under its file name.
    #[test]
    fn decode_extracts_bindata_images() {
        let section = r#"<sec><p paraPrIDRef="0"><run charPrIDRef="0"><t>Body</t></run></p></sec>"#;

        // Add a BinData image
        let fake_png: Vec<u8> = vec![0x89, 0x50, 0x4E, 0x47]; // PNG magic bytes
        let bytes = make_test_hwpx_with_entries(
            HEADER,
            &[section],
            &[("BinData/logo.png", fake_png.as_slice())],
        );
        let result = HwpxDecoder::decode(&bytes).unwrap();

        assert!(!result.image_store.is_empty(), "image store should contain extracted images");
        let data = result.image_store.get("logo.png").expect("should find logo.png");
        assert_eq!(data, &fake_png);
    }
}