PDF - Portable Documents

Extract text, tables, and metadata from PDFs, and merge or annotate them, with comprehensive PDF processing capabilities.


npx degit LangbaseInc/agent-skills/document-skills/pdf my-pdf-skill

Work with PDF files programmatically for extraction, manipulation, generation, and analysis.


Text Extraction

  • Extract all text
  • Extract by page
  • Preserve layout
  • Handle multi-column
  • OCR for scanned PDFs

Table Extraction

  • Detect tables automatically
  • Extract table data
  • Convert to CSV/Excel
  • Preserve structure

Metadata

  • Read PDF properties
  • Author, title, subject
  • Creation/modification dates
  • Custom metadata

Manipulation

  • Merge multiple PDFs
  • Split PDFs
  • Rotate pages
  • Delete/reorder pages
  • Add watermarks

Generation

  • Create PDFs from scratch
  • HTML to PDF
  • Images to PDF
  • Add text and images
  • Create forms

Python

# PyPDF2 - Basic operations
from PyPDF2 import PdfReader, PdfWriter

reader = PdfReader('input.pdf')
writer = PdfWriter()

# Extract text: print the text of each page of input.pdf, in page order.
for page in reader.pages:
    print(page.extract_text())

# Merge PDFs: copy every page of each source file into the shared writer.
for pdf in ['file1.pdf', 'file2.pdf']:
    reader = PdfReader(pdf)
    for page in reader.pages:
        writer.add_page(page)

# Write the combined document once all pages have been collected.
writer.write('merged.pdf')
# pdfplumber - Advanced extraction
import pdfplumber

with pdfplumber.open('document.pdf') as pdf:
    # Both extractions below operate on the first page only.
    first_page = pdf.pages[0]

    # Extract text
    text = first_page.extract_text()

    # Extract tables
    tables = first_page.extract_tables()
    for table in tables:
        print(table)
# ReportLab - PDF generation
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter

# Create a letter-sized canvas backed by output.pdf.
c = canvas.Canvas("output.pdf", pagesize=letter)

# Draw the text at canvas position (100, 750).
c.drawString(100, 750, "Hello World")

# Finish the document.
c.save()

Node.js

const PDFDocument = require('pdfkit');
const fs = require('fs');

// Create PDF
const doc = new PDFDocument();

// Stream the generated document into output.pdf.
const output = fs.createWriteStream('output.pdf');
doc.pipe(output);

// Set the font size, then place the text at (100, 100).
doc.fontSize(25);
doc.text('Hello World', 100, 100);

// Finalize the document.
doc.end();

import pdfplumber


def extract_all_text(pdf_path):
    """Return the text of every page in the PDF as one string.

    Each page's text is followed by a newline.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        The concatenated page texts.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page with no text (for example
        # a scanned page, in some pdfplumber versions); treat that as an
        # empty string instead of failing on ``None + '\n'``.
        return ''.join(
            (page.extract_text() or '') + '\n' for page in pdf.pages
        )


# With layout preservation
def extract_with_layout(pdf_path):
    """Print the text of each page, extracted with ``layout=True``.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text(layout=True)
            print(text)

import pdfplumber
import pandas as pd


def extract_tables(pdf_path):
    """Return every table found in the PDF as a list of DataFrames.

    The first row of each table becomes the column header and the
    remaining rows become the data. Empty tables are skipped because they
    have no header row to use.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        A list of ``pandas.DataFrame`` objects, in page order.
    """
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table:
                    # No rows at all: indexing table[0] would raise.
                    continue
                header, *rows = table
                tables.append(pd.DataFrame(rows, columns=header))
    return tables


# Save to Excel
tables = extract_tables('document.pdf')
# Only create the workbook when there is something to write; closing an
# ExcelWriter that has no sheets fails (observed with the openpyxl engine).
if tables:
    with pd.ExcelWriter('output.xlsx') as writer:
        for i, df in enumerate(tables):
            df.to_excel(writer, sheet_name=f'Table_{i+1}')

from PyPDF2 import PdfWriter, PdfReader


def merge_pdfs(pdf_list, output):
    """Concatenate the pages of the given PDFs into a single file.

    Args:
        pdf_list: Iterable of input PDF paths, merged in the order given.
        output: Path of the merged PDF to write.
    """
    merged = PdfWriter()
    for source in pdf_list:
        source_reader = PdfReader(source)
        for page in source_reader.pages:
            merged.add_page(page)

    with open(output, 'wb') as out_file:
        merged.write(out_file)


# Usage
merge_pdfs(['file1.pdf', 'file2.pdf'], 'merged.pdf')

from PyPDF2 import PdfReader, PdfWriter


def split_pdf(input_pdf, output_prefix):
    """Write each page of ``input_pdf`` to its own single-page PDF.

    Output files are named ``{output_prefix}_{n}.pdf`` with ``n`` starting
    at 1.

    Args:
        input_pdf: Path of the PDF to split.
        output_prefix: Prefix used to build each output file name.
    """
    source = PdfReader(input_pdf)
    for page_number, page in enumerate(source.pages, start=1):
        single_page = PdfWriter()
        single_page.add_page(page)
        target = f'{output_prefix}_{page_number}.pdf'
        with open(target, 'wb') as out_file:
            single_page.write(out_file)


# Split into individual pages
split_pdf('document.pdf', 'page')

import io

from PyPDF2 import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter


def add_watermark(input_pdf, output_pdf, watermark_text):
    """Stamp ``watermark_text`` onto every page of a PDF.

    The watermark is rendered as a one-page PDF held in memory, so no
    intermediate ``watermark.pdf`` is written to (or left behind in) the
    working directory.

    Args:
        input_pdf: Path of the PDF to watermark.
        output_pdf: Path of the watermarked PDF to write.
        watermark_text: Text drawn on each page.
    """
    # Create watermark (in memory rather than in a fixed-name file).
    watermark_buffer = io.BytesIO()
    c = canvas.Canvas(watermark_buffer, pagesize=letter)
    c.setFillColorRGB(0.5, 0.5, 0.5, alpha=0.3)  # semi-transparent grey
    c.setFont('Helvetica', 60)
    c.drawString(100, 400, watermark_text)
    c.save()
    watermark_buffer.seek(0)

    # Apply to PDF
    # NOTE(review): the watermark page is always letter-sized and drawn at a
    # fixed position; inputs with other page sizes may need scaling.
    reader = PdfReader(input_pdf)
    watermark_reader = PdfReader(watermark_buffer)
    watermark = watermark_reader.pages[0]

    writer = PdfWriter()
    for page in reader.pages:
        page.merge_page(watermark)
        writer.add_page(page)

    with open(output_pdf, 'wb') as f:
        writer.write(f)

from pdf2image import convert_from_path
import pytesseract


def ocr_pdf(pdf_path):
    """Return the OCR text of a PDF, one newline-terminated chunk per image.

    Args:
        pdf_path: Path of the PDF, passed to ``convert_from_path``.

    Returns:
        The recognized text of all page images, concatenated in order.
    """
    # Convert PDF to images
    page_images = convert_from_path(pdf_path)

    # Perform OCR on each image and terminate each result with a newline.
    return ''.join(
        pytesseract.image_to_string(page_image) + '\n'
        for page_image in page_images
    )

from PyPDF2 import PdfReader


def get_metadata(pdf_path):
    """Return basic document properties of a PDF as a dict.

    Args:
        pdf_path: Path of the PDF to inspect.

    Returns:
        A dict with the keys ``title``, ``author``, ``subject``,
        ``creator``, ``producer`` and ``pages`` (the page count). The five
        text fields are all ``None`` when the PDF has no metadata.
    """
    reader = PdfReader(pdf_path)
    # reader.metadata is None when the file has no document-information
    # dictionary; reading attributes off it would raise AttributeError.
    metadata = reader.metadata

    field_names = ('title', 'author', 'subject', 'creator', 'producer')
    info = {
        name: getattr(metadata, name) if metadata is not None else None
        for name in field_names
    }
    info['pages'] = len(reader.pages)
    return info

import pdfkit

# Simple conversion
# Three input kinds: a URL, a local HTML file, and an HTML string.
# Every call here targets the same 'output.pdf'.
pdfkit.from_url('https://example.com', 'output.pdf')
pdfkit.from_file('input.html', 'output.pdf')
pdfkit.from_string('<h1>Hello</h1>', 'output.pdf')

# With options
# Page size plus a 0.75in margin on every side.
options = {
    'page-size': 'A4',
    'margin-top': '0.75in',
    'margin-right': '0.75in',
    'margin-bottom': '0.75in',
    'margin-left': '0.75in',
}
pdfkit.from_url('https://example.com', 'output.pdf', options=options)

Use Cases

  • Document archival
  • Invoice processing
  • Contract analysis
  • Form data extraction
  • Report generation
  • Document automation
  • Data mining
  • Digital signatures

Best Practices

  • Validate PDFs before processing
  • Handle encrypted PDFs properly
  • Use OCR for scanned documents
  • Preserve original files
  • Optimize for large files
  • Handle exceptions gracefully
  • Test across PDF versions
  • Consider memory usage