How to Build a Parsing Pipeline with Docling Parse for Layout-Aware Document Intelligence


def create_demo_image(path):
    """Draw a small black-on-white test bitmap and save it to *path*.

    The 320x180 RGB picture contains a framing rectangle, an ellipse,
    a diagonal line, and a short caption.

    Args:
        path: Destination handed unchanged to the image's ``save`` method.
    """
    ink = "black"
    bitmap = Image.new("RGB", (320, 180), "white")
    pen = ImageDraw.Draw(bitmap)

    # Outer frame, then the two shapes placed inside it.
    pen.rectangle([20, 20, 300, 160], outline=ink, width=3)
    pen.ellipse([55, 45, 145, 135], outline=ink, width=4)
    pen.line([180, 140, 285, 45], fill=ink, width=4)

    # Caption near the bottom edge of the frame.
    pen.text((45, 145), "Embedded bitmap image", fill=ink)

    bitmap.save(path)
# Write the demo bitmap to DEMO_IMAGE_PATH (defined outside this excerpt);
# build_pdf() below embeds the file found at that same path.
create_demo_image(DEMO_IMAGE_PATH)
def build_pdf(pdf_path):
    """Generate a two-page A4 sample PDF at *pdf_path*.

    Page 1 holds a title, an intro paragraph, two side-by-side text
    columns with outlined boxes, a circle and a line, and a small table.
    Page 2 holds four repeated text blocks, the bitmap read from the
    module-level ``DEMO_IMAGE_PATH`` inside a rounded rectangle, and a
    closing caption.

    Args:
        pdf_path: Output location; converted with ``str()`` before being
            passed to ``canvas.Canvas``.
    """
    c = canvas.Canvas(str(pdf_path), pagesize=A4)
    page_w, page_h = A4

    def emit_paragraph(x, y, body, wrap_at, leading, font=None):
        """Wrap *body* and draw it as one text object starting at (x, y).

        When *font* is None the text object is left with whatever font it
        picks up at creation; otherwise *font* is a (name, size) pair set
        on the text object before the leading is applied.
        """
        block = c.beginText(x, y)
        if font is not None:
            block.setFont(*font)
        block.setLeading(leading)
        for row in textwrap.wrap(body, width=wrap_at):
            block.textLine(row)
        c.drawText(block)

    # ---- Page 1: heading and introduction --------------------------------
    c.setFont("Helvetica-Bold", 20)
    c.drawString(60, page_h - 70, "Docling Parse Advanced PDF Parsing Tutorial")

    # The intro paragraph sets no font of its own, so the canvas font is
    # selected just before its text object is created.
    c.setFont("Helvetica", 11)
    intro_copy = (
        "This generated document is designed for testing text extraction, coordinate parsing, "
        "line grouping, vector path detection, bitmap resources, and layout-aware reconstruction."
    )
    emit_paragraph(60, page_h - 105, intro_copy, wrap_at=90, leading=15)

    # ---- Page 1: two-column region ---------------------------------------
    c.setFont("Helvetica-Bold", 14)
    c.drawString(60, page_h - 170, "1. Two-column text region")

    column_font = ("Helvetica", 10)
    columns_top = page_h - 200
    left_copy = (
        "The left column contains compact explanatory text. A parser should expose words, "
        "characters, and line-level cells along with coordinates. These coordinates allow us "
        "to reconstruct reading order and inspect the spatial structure of a page."
    )
    right_copy = (
        "The right column contains a separate paragraph. In document AI pipelines, layout "
        "features are useful for retrieval, table extraction, chunking, and downstream RAG "
        "applications where page position can matter."
    )
    emit_paragraph(60, columns_top, left_copy, wrap_at=42, leading=13, font=column_font)
    emit_paragraph(325, columns_top, right_copy, wrap_at=42, leading=13, font=column_font)

    # Outlined boxes drawn around the two columns.
    c.setStrokeColor(colors.darkblue)
    c.setLineWidth(2)
    c.rect(55, page_h - 315, 225, 130, stroke=1, fill=0)
    c.rect(320, page_h - 315, 225, 130, stroke=1, fill=0)

    # Extra vector shapes: one circle and one diagonal line.
    c.setStrokeColor(colors.darkgreen)
    c.setLineWidth(3)
    c.circle(140, page_h - 390, 40, stroke=1, fill=0)
    c.line(220, page_h - 430, 310, page_h - 355)

    # ---- Page 1: table ----------------------------------------------------
    c.setFont("Helvetica-Bold", 14)
    c.setFillColor(colors.black)
    c.drawString(60, page_h - 470, "2. Simple table-like structure")

    rows = [
        ["Section", "Signal", "Expected parser behavior"],
        ["Text", "Words and lines", "Return text cells with coordinates"],
        ["Vector", "Boxes and lines", "Expose page path/vector resources"],
        ["Bitmap", "Embedded image", "Expose or render image resources"],
    ]
    everywhere = ((0, 0), (-1, -1))
    header_row = ((0, 0), (-1, 0))
    grid = Table(rows, colWidths=[100, 130, 260])
    grid.setStyle(TableStyle([
        ("BACKGROUND", *header_row, colors.lightgrey),
        ("GRID", *everywhere, 0.7, colors.black),
        ("FONTNAME", *header_row, "Helvetica-Bold"),
        ("FONTSIZE", *everywhere, 9),
        ("VALIGN", *everywhere, "MIDDLE"),
    ]))
    grid.wrapOn(c, page_w, page_h)
    grid.drawOn(c, 60, page_h - 590)

    c.setFont("Helvetica", 9)
    c.drawString(60, 55, "Page 1: generated programmatic PDF with text, table-like layout, and vector paths.")
    c.showPage()

    # ---- Page 2: repeated text blocks ------------------------------------
    c.setFont("Helvetica-Bold", 18)
    c.drawString(60, page_h - 70, "Page 2: Bitmap, Dense Text, and Reading Order")
    c.setFont("Helvetica", 10)

    dense_copy = (
        "This page includes an embedded bitmap image and several short blocks of text. "
        "We use it to test whether rendering works, whether the parser preserves page-level "
        "coordinates, and whether our own reconstruction logic can group words into lines."
    )
    # NOTE(review): the fourth block starts at page_h - 315 while the image
    # placed below reaches up to page_h - 330, so the image may paint over
    # that block's lower lines -- confirm this overlap is intended.
    cursor_y = page_h - 105
    for block_no in range(1, 5):
        emit_paragraph(
            60,
            cursor_y,
            f"Block {block_no}: {dense_copy}",
            wrap_at=92,
            leading=13,
            font=("Helvetica", 10),
        )
        cursor_y -= 70

    # ---- Page 2: embedded bitmap with a red rounded frame ------------------
    c.drawImage(
        str(DEMO_IMAGE_PATH),
        110,
        page_h - 510,
        width=320,
        height=180,
        preserveAspectRatio=True,
    )
    c.setStrokeColor(colors.red)
    c.setLineWidth(2)
    c.roundRect(95, page_h - 525, 350, 210, 10, stroke=1, fill=0)

    c.setFillColor(colors.black)
    c.setFont("Helvetica-Bold", 12)
    c.drawString(60, page_h - 570, "Coordinate-aware extraction lets us keep page, text, and position together.")

    c.setFont("Helvetica", 9)
    c.drawString(60, 55, "Page 2: embedded bitmap image and multiple text blocks.")
    c.save()
# Generate the two-page demo PDF at PDF_PATH (defined outside this excerpt)
# and print the location it was written to.
build_pdf(PDF_PATH)
print("Created PDF:", PDF_PATH)



Source link

Leave a Reply

Your email address will not be published. Required fields are marked *