diff --git a/pdf-table-extraction-docling-vs-llamaparse/docling_extraction.py b/pdf-table-extraction-docling-vs-llamaparse/docling_extraction.py
new file mode 100644
index 0000000000..b49baaa0bf
--- /dev/null
+++ b/pdf-table-extraction-docling-vs-llamaparse/docling_extraction.py
@@ -0,0 +1,24 @@
+"""Parse a PDF with Docling and print Markdown output."""
+
+from pathlib import Path
+
+from docling.document_converter import DocumentConverter
+
+PDF_PATH = Path("sample_report.pdf")
+
+
+def main() -> None:
+    """Convert the PDF with Docling and print a Markdown preview and counts."""
+    converter = DocumentConverter()
+    result = converter.convert(PDF_PATH)
+
+    markdown = result.document.export_to_markdown()
+    # Only the first 3000 characters are printed to keep the output short.
+    print(markdown[:3000])
+    print("\n---\n")
+    print(f"Pages parsed: {len(result.document.pages)}")
+    print(f"Tables found: {len(result.document.tables)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pdf-table-extraction-docling-vs-llamaparse/docling_formats.py b/pdf-table-extraction-docling-vs-llamaparse/docling_formats.py
new file mode 100644
index 0000000000..8166d5b850
--- /dev/null
+++ b/pdf-table-extraction-docling-vs-llamaparse/docling_formats.py
@@ -0,0 +1,36 @@
+"""Export Docling parse results to Markdown, JSON, HTML, and pandas DataFrames."""
+
+import json
+from pathlib import Path
+
+from docling.document_converter import DocumentConverter
+
+PDF_PATH = Path("sample_report.pdf")
+
+
+def main() -> None:
+    """Convert the PDF once and export it as Markdown, JSON, HTML, and tables."""
+    converter = DocumentConverter()
+    document = converter.convert(PDF_PATH).document
+
+    markdown = document.export_to_markdown()
+    Path("output_docling.md").write_text(markdown, encoding="utf-8")
+
+    payload = document.export_to_dict()
+    Path("output_docling.json").write_text(
+        json.dumps(payload, indent=2),
+        encoding="utf-8",
+    )
+
+    html = document.export_to_html()
+    Path("output_docling.html").write_text(html, encoding="utf-8")
+
+    # Each table is exported to a DataFrame and previewed with head().
+    for index, table in enumerate(document.tables):
+        frame = table.export_to_dataframe(doc=document)
+        print(f"Table {index} shape: {frame.shape}")
+        print(frame.head(), end="\n\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pdf-table-extraction-docling-vs-llamaparse/llamaparse_extraction.py b/pdf-table-extraction-docling-vs-llamaparse/llamaparse_extraction.py
new file mode 100644
index 0000000000..8f10eae11a
--- /dev/null
+++ b/pdf-table-extraction-docling-vs-llamaparse/llamaparse_extraction.py
@@ -0,0 +1,35 @@
+"""Parse a PDF with LlamaParse (llama-cloud SDK) and print Markdown output."""
+
+import os
+from pathlib import Path
+
+from llama_cloud import LlamaCloud
+
+PDF_PATH = Path("sample_report.pdf")
+
+
+def main() -> None:
+    """Upload the PDF, parse it with LlamaParse, and print a Markdown preview.
+
+    Raises:
+        KeyError: If the LLAMA_CLOUD_API_KEY environment variable is not set.
+    """
+    client = LlamaCloud(api_key=os.environ["LLAMA_CLOUD_API_KEY"])
+
+    uploaded = client.files.create(file=PDF_PATH, purpose="parse")
+    result = client.parsing.parse(
+        file_id=uploaded.id,
+        tier="agentic",
+        version="latest",
+        expand=["markdown"],
+    )
+
+    # Build the text in one pass; every page is followed by a separator.
+    pages = "".join(page.markdown + "\n---\n" for page in result.markdown.pages)
+
+    print(pages[:3000])
+    print(f"Pages parsed: {len(result.markdown.pages)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pdf-table-extraction-docling-vs-llamaparse/llamaparse_formats.py b/pdf-table-extraction-docling-vs-llamaparse/llamaparse_formats.py
new file mode 100644
index 0000000000..f1848b6a1b
--- /dev/null
+++ b/pdf-table-extraction-docling-vs-llamaparse/llamaparse_formats.py
@@ -0,0 +1,70 @@
+"""Export LlamaParse results to Markdown, Text, and schema-driven JSON."""
+
+import json
+import os
+from pathlib import Path
+
+from llama_cloud import LlamaCloud
+from pydantic import BaseModel, Field
+
+PDF_PATH = Path("sample_report.pdf")
+
+
+# Schema for a single table row. Kept as a comment (not a docstring) because
+# pydantic adds class docstrings to the JSON schema sent for extraction.
+class RevenueRow(BaseModel):
+    quarter: str = Field(description="Fiscal quarter label, e.g. Q1 2024")
+    revenue_millions: float = Field(description="Revenue in millions of USD")
+    growth_percent: float | None = Field(
+        default=None,
+        description="Year-over-year growth percentage if stated",
+    )
+
+
+# Top-level extraction schema; passed to the extract call as "data_schema".
+class RevenueTable(BaseModel):
+    rows: list[RevenueRow] = Field(description="One row per quarter in the table")
+
+
+def main() -> None:
+    """Parse the PDF to Markdown and text files, then extract JSON by schema.
+
+    Raises:
+        KeyError: If the LLAMA_CLOUD_API_KEY environment variable is not set.
+    """
+    client = LlamaCloud(api_key=os.environ["LLAMA_CLOUD_API_KEY"])
+
+    uploaded = client.files.create(file=PDF_PATH, purpose="parse")
+
+    parsed = client.parsing.parse(
+        file_id=uploaded.id,
+        tier="agentic",
+        version="latest",
+        expand=["markdown", "text"],
+    )
+
+    markdown_pages = "\n\n".join(page.markdown for page in parsed.markdown.pages)
+    Path("output_llamaparse.md").write_text(markdown_pages, encoding="utf-8")
+
+    if parsed.text and parsed.text.pages:
+        text_pages = "\n".join(page.text for page in parsed.text.pages)
+        Path("output_llamaparse.text").write_text(text_pages, encoding="utf-8")
+
+    extract_file = client.files.create(file=PDF_PATH, purpose="extract")
+    job = client.extract.run(
+        file_input=extract_file.id,
+        configuration={
+            "data_schema": RevenueTable.model_json_schema(),
+            "extraction_target": "per_doc",
+            "tier": "agentic",
+        },
+    )
+
+    # Serialize once so the saved file and the console output always match.
+    extracted = json.dumps(job.extract_result, indent=2)
+    Path("output_llamaparse.json").write_text(extracted, encoding="utf-8")
+    print(extracted)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pdf-table-extraction-docling-vs-llamaparse/requirements.txt b/pdf-table-extraction-docling-vs-llamaparse/requirements.txt
new file mode 100644
index 0000000000..26adf72b87
--- /dev/null
+++ b/pdf-table-extraction-docling-vs-llamaparse/requirements.txt
@@ -0,0 +1,4 @@
+docling==2.102.2
+llama-cloud>=2.9.0
+pandas>=2.0.0
+pydantic>=2.0.0
diff --git a/pdf-table-extraction-docling-vs-llamaparse/sample_report.pdf b/pdf-table-extraction-docling-vs-llamaparse/sample_report.pdf
new file mode 100644
index 0000000000..bce6248ad1
Binary files /dev/null and b/pdf-table-extraction-docling-vs-llamaparse/sample_report.pdf differ