Using Lift to Turn Research PDFs into Structured JSON with Controlled, Schema-Guided Field-Level Evaluation


def render_pdf(d, path):
    """Render the report described by ``d`` as a multi-page PDF written to ``path``.

    Two explicit page breaks split the document into three parts:

    1. title, author line, abstract and keywords;
    2. method / training details (Table 1) and the dataset list;
    3. results (Table 2), limitations and the funding note.

    The test metric ``d['test_acc']`` is therefore stated in the abstract
    (before the first break) and again in Table 2 (after the second break).

    Args:
        d: Mapping for one report. Keys read here: ``title``, ``authors``
            (iterable of 2-item pairs rendered as ``first (second)`` --
            presumably name and affiliation; confirm against ``DOCS``),
            ``method``, ``task``, ``primary_benchmark``, ``metric_name``,
            ``test_acc``, ``val_acc``, ``prior_best``, ``beats_sota``,
            ``params_m``, ``datasets``, ``optimizer``, ``lr``, ``batch``,
            ``epochs``, ``baseline_name``, ``baseline_val``,
            ``baseline_test``, ``limitations`` and ``funding_note``.
        path: Output file name passed to ``SimpleDocTemplate``.

    Returns:
        None. The PDF is produced as a side effect of ``build``.
    """
    from reportlab.lib import colors
    from reportlab.lib.pagesizes import LETTER
    from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
    from reportlab.lib.units import inch
    from reportlab.platypus import (
        PageBreak,
        Paragraph,
        SimpleDocTemplate,
        Spacer,
        Table,
        TableStyle,
    )

    # Paragraph styles derived from ReportLab's sample stylesheet.
    base = getSampleStyleSheet()
    title_style = ParagraphStyle(
        "H1", parent=base["Title"], fontSize=16, leading=20, spaceAfter=6)
    author_style = ParagraphStyle(
        "AUTH", parent=base["Normal"], fontSize=9.5, textColor=colors.grey, spaceAfter=10)
    heading_style = ParagraphStyle(
        "H2", parent=base["Heading2"], fontSize=12, spaceBefore=8, spaceAfter=4)
    body_style = ParagraphStyle(
        "BODY", parent=base["Normal"], fontSize=10, leading=14, spaceAfter=6)

    # NOTE(review): Paragraph parses its text as ReportLab inline markup and the
    # values taken from `d` are inserted unescaped -- confirm the inputs never
    # contain raw '<' or '&'.
    def heading(text):
        return Paragraph(text, heading_style)

    def body(text):
        return Paragraph(text, body_style)

    def table_style(header_hex, stripe_hex, extra=()):
        """Build the grid style both tables share; ``extra`` goes right after GRID."""
        return TableStyle([
            ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor(header_hex)),
            ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
            ("FONTSIZE", (0, 0), (-1, -1), 9.5),
            ("GRID", (0, 0), (-1, -1), 0.4, colors.grey),
            *extra,
            ("ROWBACKGROUNDS", (0, 1), (-1, -1),
             [colors.white, colors.HexColor(stripe_hex)]),
            ("LEFTPADDING", (0, 0), (-1, -1), 8),
            ("TOPPADDING", (0, 0), (-1, -1), 4),
            ("BOTTOMPADDING", (0, 0), (-1, -1), 4),
        ])

    # Abstract wording depends on whether the method beats the prior best.
    if d["beats_sota"]:
        sota_clause = f"surpassing the previous best of {d['prior_best']}"
    else:
        sota_clause = f"approaching but not exceeding the previous best of {d['prior_best']}"
    byline = ", ".join(f"{first} ({second})" for first, second in d["authors"])

    flow = []

    # ---- Part 1: title, authors, abstract, keywords -------------------------
    flow.append(Paragraph(d["title"], title_style))
    flow.append(Paragraph(byline, author_style))
    flow.append(heading("Abstract"))
    abstract_text = (
        f"We introduce {d['method']}, a model for {d['task']}. On the {d['primary_benchmark']} "
        f"benchmark, {d['method']} attains {d['test_acc']} {d['metric_name']} on the held-out "
        f"test set, {sota_clause}. Our {d['params_m']}M-parameter model is evaluated across "
        f"{len(d['datasets'])} datasets ({', '.join(d['datasets'])}). "
        f"Extensive ablations confirm the contribution of each component."
    )
    flow.append(body(abstract_text))
    flow.append(heading("Keywords"))
    flow.append(body(f"{d['task']}; representation learning; {d['primary_benchmark']}"))
    flow.append(PageBreak())

    # ---- Part 2: training configuration (Table 1) and datasets --------------
    flow.append(heading("1  Method and Training Details"))
    method_text = (
        f"{d['method']} is trained end-to-end with the {d['optimizer']} optimizer. "
        f"We tune on a validation split and report final numbers on the test split. "
        f"The full training configuration is summarized in Table 1."
    )
    flow.append(body(method_text))
    config_rows = [
        ["Hyperparameter", "Value"],
        ["Optimizer", d["optimizer"]],
        ["Learning rate", str(d["lr"])],
        ["Batch size", str(d["batch"])],
        ["Epochs", str(d["epochs"])],
        ["Parameters", f"{d['params_m']}M"],
    ]
    config_table = Table(config_rows, colWidths=[2.4 * inch, 2.0 * inch])
    config_table.setStyle(table_style("#2b3a67", "#eef1f8"))
    flow.append(Spacer(1, 4))
    flow.append(config_table)
    flow.append(Spacer(1, 6))
    flow.append(body("Table 1. Training configuration."))
    flow.append(heading("2  Datasets"))
    datasets_text = (
        f"We evaluate on {', '.join(d['datasets'])}. {d['primary_benchmark']} is our "
        f"primary benchmark; the remaining datasets are used for generalization "
        f"studies."
    )
    flow.append(body(datasets_text))
    flow.append(PageBreak())

    # ---- Part 3: results (Table 2), limitations, funding --------------------
    flow.append(heading("3  Results"))
    result_rows = [
        ["Method", f"Val. {d['metric_name']}", f"Test {d['metric_name']}"],
        [f"{d['baseline_name']} (baseline)", str(d["baseline_val"]), str(d["baseline_test"])],
        [f"{d['method']} (ours)", str(d["val_acc"]), str(d["test_acc"])],
    ]
    result_table = Table(result_rows, colWidths=[2.6 * inch, 1.7 * inch, 1.7 * inch])
    # Row index 2 (the "(ours)" row) is always the one set in bold.
    bold_ours = [("FONTNAME", (0, 2), (-1, 2), "Helvetica-Bold")]
    result_table.setStyle(table_style("#7a2e2e", "#f7eeee", extra=bold_ours))
    flow.append(Spacer(1, 4))
    flow.append(result_table)
    flow.append(Spacer(1, 6))
    flow.append(body(f"Table 2. Results on {d['primary_benchmark']}. "
                     f"Best test result in bold."))
    flow.append(heading("4  Limitations"))
    flow.extend(body("• " + item) for item in d["limitations"])
    flow.append(heading("5  Funding and Code Availability"))
    flow.append(body(d["funding_note"]))

    doc = SimpleDocTemplate(
        path,
        pagesize=LETTER,
        topMargin=0.8 * inch,
        bottomMargin=0.8 * inch,
        leftMargin=0.9 * inch,
        rightMargin=0.9 * inch,
    )
    doc.build(flow)
print("STEP 3/7 · Generating synthetic report PDFs…")
# One (spec, ground truth, pdf path) triple is collected per entry of DOCS.
CORPUS = []
for i, d in enumerate(DOCS):
    # Write under /content when that directory exists, otherwise use a
    # relative file name (resolved against the current working directory).
    if os.path.isdir("/content"):
        path = f"/content/report_{i}.pdf"
    else:
        path = f"report_{i}.pdf"
    render_pdf(d, path)
    CORPUS.append((d, ground_truth(d), path))
    print(f"     ✓ {os.path.basename(path)}  —  {d['method']}")
print()

if SHOW_FIRST_PAGE:
    # Best-effort preview of the first generated PDF's first page: any failure
    # (missing optional packages, empty corpus, rendering error) is reported
    # and skipped rather than aborting the run.
    try:
        import pypdfium2 as pdfium
        import matplotlib.pyplot as plt

        pg = pdfium.PdfDocument(CORPUS[0][2])[0]
        img = pg.render(scale=2.0).to_pil()
        plt.figure(figsize=(6.4, 8.3))
        plt.imshow(img)
        plt.axis("off")
        plt.title("What lift reads — page 1 of report_0.pdf", fontsize=10)
        plt.show()
    except Exception as e:
        print("     (page preview skipped:", e, ")\n")



Source link

  • Related Posts

    Anthropic Redeploys Claude Fable 5 on July 1 After US Export Controls Lift, Adds New Cybersecurity Classifier

    Anthropic is redeploying Claude Fable 5, its most capable generally available model. On June 30, it announced that US export controls had lifted. The controls had covered Claude Fable 5…

    2026 BAIR Graduate Showcase – The Berkeley Artificial Intelligence Research Blog

    Congratulations to the Berkeley Artificial Intelligence Research (BAIR) Lab class of 2026! This year, BAIR celebrates another remarkable group of Ph.D. graduates whose curiosity, creativity, and perseverance have pushed the…

    Leave a Reply

    Your email address will not be published. Required fields are marked *