PDF - Portable Documents
Extract text, tables, and metadata from PDFs, and merge or annotate them, with comprehensive PDF processing capabilities.
npx degit LangbaseInc/agent-skills/document-skills/pdf my-pdf-skill
Work with PDF files programmatically for extraction, manipulation, generation, and analysis.
Text Extraction
- Extract all text
- Extract by page
- Preserve layout
- Handle multi-column layouts
- OCR for scanned PDFs
Table Extraction
- Detect tables automatically
- Extract table data
- Convert to CSV/Excel
- Preserve structure
Metadata
- Read PDF properties
- Author, title, subject
- Creation/modification dates
- Custom metadata
Manipulation
- Merge multiple PDFs
- Split PDFs
- Rotate pages
- Delete/reorder pages
- Add watermarks
Generation
- Create PDFs from scratch
- HTML to PDF
- Images to PDF
- Add text and images
- Create forms
Python
# PyPDF2 - Basic operations
from PyPDF2 import PdfReader, PdfWriter
# Open the source document and create an empty writer for the merged output.
reader = PdfReader('input.pdf')
writer = PdfWriter()

# Extract text: print the extracted text of every page of input.pdf.
for page in reader.pages:
    print(page.extract_text())

# Merge PDFs: append every page of each input file, in list order.
for pdf in ['file1.pdf', 'file2.pdf']:
    reader = PdfReader(pdf)
    for page in reader.pages:
        writer.add_page(page)

# Write through an explicitly opened binary file rather than a bare path:
# the handle is closed even if writing fails, and a binary stream is the
# argument form PdfWriter.write() is documented to accept.
with open('merged.pdf', 'wb') as f:
    writer.write(f)
# pdfplumber - Advanced extraction
import pdfplumber
# Open document.pdf and pull both the text and the tables of its first page.
with pdfplumber.open('document.pdf') as pdf:
    first_page = pdf.pages[0]
    # Extract text
    text = first_page.extract_text()
    # Extract tables
    tables = first_page.extract_tables()
    for table in tables:
        print(table)
# ReportLab - PDF generation
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
# Create output.pdf with the `letter` page size, draw one string at
# coordinates (100, 750), and save the document.
pdf_canvas = canvas.Canvas("output.pdf", pagesize=letter)
pdf_canvas.drawString(100, 750, "Hello World")
pdf_canvas.save()
Node.js
const PDFDocument = require('pdfkit');
const fs = require('fs');
// Create PDF: stream a new document into output.pdf, write one line of
// 25-point text at (100, 100), then finalize the document.
const pdf = new PDFDocument();
const output = fs.createWriteStream('output.pdf');
pdf.pipe(output);
pdf.fontSize(25);
pdf.text('Hello World', 100, 100);
pdf.end();
import pdfplumber
def extract_all_text(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Pages are processed in order and each page's text is followed by a
    newline. A page for which ``extract_text()`` yields ``None`` or an
    empty string contributes only its newline instead of raising
    ``TypeError`` on ``None + '\\n'``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # join() consumes the generator while the PDF is still open.
        return ''.join(
            (page.extract_text() or '') + '\n' for page in pdf.pages
        )
# With layout preservation
def extract_with_layout(pdf_path):
    """Print each page's text, extracted with ``layout=True``, in page order."""
    with pdfplumber.open(pdf_path) as document:
        for current_page in document.pages:
            print(current_page.extract_text(layout=True))
import pdfplumber
import pandas as pd
def extract_tables(pdf_path):
    """Collect every table in the PDF at *pdf_path* as a list of DataFrames.

    Tables are gathered page by page, in the order ``extract_tables()``
    returns them. The first row of each table becomes the column labels
    and the remaining rows become the data.
    """
    with pdfplumber.open(pdf_path) as document:
        return [
            pd.DataFrame(rows[1:], columns=rows[0])
            for current_page in document.pages
            for rows in current_page.extract_tables()
        ]
# Save to Excel: one sheet per extracted table, named Table_1, Table_2, ...
tables = extract_tables('document.pdf')
# Saving a workbook that never received a sheet can fail (e.g. with the
# openpyxl engine), so only create output.xlsx when there is something to
# write.
if tables:
    with pd.ExcelWriter('output.xlsx') as writer:
        for i, df in enumerate(tables):
            df.to_excel(writer, sheet_name=f'Table_{i+1}')
from PyPDF2 import PdfWriter, PdfReader
def merge_pdfs(pdf_list, output):
    """Concatenate the PDFs in *pdf_list* and write the result to *output*.

    Pages are appended file by file in list order, and within each file in
    page order. *output* is opened for binary writing.
    """
    merged = PdfWriter()
    for source_path in pdf_list:
        for source_page in PdfReader(source_path).pages:
            merged.add_page(source_page)

    with open(output, 'wb') as out_file:
        merged.write(out_file)
# Usage: combine file1.pdf and file2.pdf, in that order, into merged.pdf
merge_pdfs(['file1.pdf', 'file2.pdf'], 'merged.pdf')
from PyPDF2 import PdfReader, PdfWriter
def split_pdf(input_pdf, output_prefix):
    """Write each page of *input_pdf* to its own single-page PDF.

    Output files are named ``<output_prefix>_<n>.pdf`` where ``n`` is the
    1-based page number.
    """
    source_pages = PdfReader(input_pdf).pages
    for i, single_page in enumerate(source_pages):
        page_writer = PdfWriter()
        page_writer.add_page(single_page)
        target_name = f'{output_prefix}_{i+1}.pdf'
        with open(target_name, 'wb') as out_file:
            page_writer.write(out_file)
# Split into individual pages: one output file per page, each named with the
# 'page' prefix
split_pdf('document.pdf', 'page')
from PyPDF2 import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
def add_watermark(input_pdf, output_pdf, watermark_text):
    """Stamp *watermark_text* onto every page of *input_pdf*.

    The watermark is a one-page PDF using the ``letter`` page size, with
    the text drawn in 60-point Helvetica at (100, 400) using a grey fill
    and ``alpha=0.3``. It is merged into each page of the input and the
    result is written to *output_pdf*.

    The watermark page is built in an in-memory buffer, so no
    ``watermark.pdf`` scratch file is created (or overwritten) in the
    current working directory.
    """
    import io  # local import keeps this snippet self-contained

    # Create watermark
    buffer = io.BytesIO()
    c = canvas.Canvas(buffer, pagesize=letter)
    c.setFillColorRGB(0.5, 0.5, 0.5, alpha=0.3)
    c.setFont('Helvetica', 60)
    c.drawString(100, 400, watermark_text)
    c.save()
    buffer.seek(0)  # rewind so the reader starts at the beginning

    # Apply to PDF
    reader = PdfReader(input_pdf)
    watermark = PdfReader(buffer).pages[0]
    writer = PdfWriter()
    for page in reader.pages:
        page.merge_page(watermark)
        writer.add_page(page)
    with open(output_pdf, 'wb') as f:
        writer.write(f)
from pdf2image import convert_from_path
import pytesseract
def ocr_pdf(pdf_path):
    """Return the OCR text of the PDF at *pdf_path*.

    The PDF is converted to images with ``convert_from_path``; each image
    is passed to ``pytesseract.image_to_string`` and its text is followed
    by a newline.
    """
    # Convert PDF to images
    page_images = convert_from_path(pdf_path)
    # Perform OCR
    return ''.join(
        pytesseract.image_to_string(page_image) + '\n'
        for page_image in page_images
    )
from PyPDF2 import PdfReader
def get_metadata(pdf_path):
    """Return basic document properties of the PDF at *pdf_path*.

    The result is a dict with the keys ``title``, ``author``, ``subject``,
    ``creator``, ``producer`` and ``pages`` (the page count). When the PDF
    has no document-information dictionary, ``reader.metadata`` is
    ``None``; the five text fields are then reported as ``None`` instead
    of raising ``AttributeError``.
    """
    reader = PdfReader(pdf_path)
    metadata = reader.metadata
    field_names = ('title', 'author', 'subject', 'creator', 'producer')
    info = {
        name: None if metadata is None else getattr(metadata, name)
        for name in field_names
    }
    info['pages'] = len(reader.pages)
    return info
import pdfkit
# Simple conversion: render a URL, a local HTML file, and an HTML string.
# Every call writes to the same output path.
source_url = 'https://example.com'
output_path = 'output.pdf'
pdfkit.from_url(source_url, output_path)
pdfkit.from_file('input.html', output_path)
pdfkit.from_string('<h1>Hello</h1>', output_path)

# With options: A4 page size and 0.75in margins on all four sides.
page_options = {
    'page-size': 'A4',
    'margin-top': '0.75in',
    'margin-right': '0.75in',
    'margin-bottom': '0.75in',
    'margin-left': '0.75in',
}
pdfkit.from_url(source_url, output_path, options=page_options)
- Document archival
- Invoice processing
- Contract analysis
- Form data extraction
- Report generation
- Document automation
- Data mining
- Digital signatures
- Validate PDFs before processing
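- Handle encrypted PDFs properly (with PyPDF2, check `reader.is_encrypted` and call `reader.decrypt(password)` before reading pages)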
- Use OCR for scanned documents
- Preserve original files
- Optimize for large files
- Handle exceptions gracefully
- Test across PDF versions
- Consider memory usage