移除违禁词的PDF处理方案(Python实现)

如果你需要处理PDF文件中的违禁词(敏感词),同时实现扫码查看功能,以下是完整的Python解决方案:

方案一:PDF违禁词检测与替换 + 二维码生成

import re
from PyPDF2 import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
import io
import qrcode


def _mask_banned_words(text, banned_words, replacement):
    """Return ``text`` with each banned word replaced, case-insensitively."""
    for word in banned_words:
        if not word:
            # An empty pattern matches at every position and would flood
            # the text with the replacement string.
            continue
        # A callable replacement is inserted literally; a plain string would
        # have its backslashes interpreted as regex escape sequences.
        text = re.sub(
            re.escape(word),
            lambda _match: replacement,
            text,
            flags=re.IGNORECASE,
        )
    return text


def remove_banned_words(pdf_path, output_path, banned_words, replacement="[已屏蔽]"):
    """Scan a PDF for banned words and write a copy of it to ``output_path``.

    This is a simplified example: the replacement is applied only to the
    text extracted from each page. The pages written to ``output_path`` are
    the original page objects, so the output PDF still contains the original
    wording. Rewriting the page content itself needs a dedicated PDF editing
    library.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path the copied PDF is written to.
        banned_words: Iterable of literal words, matched case-insensitively.
        replacement: Text substituted for every match.

    Returns:
        A list holding the sanitized text of each page, in page order.
    """
    reader = PdfReader(pdf_path)
    writer = PdfWriter()
    sanitized_pages = []

    for page in reader.pages:
        # Treat a page without extractable text as empty.
        page_text = page.extract_text() or ""
        sanitized_pages.append(
            _mask_banned_words(page_text, banned_words, replacement)
        )
        # Keep the original page content (simplified example).
        writer.add_page(page)

    with open(output_path, "wb") as f:
        writer.write(f)

    return sanitized_pages


def add_qr_code(pdf_path, output_path, url):
    """Stamp a QR code encoding ``url`` onto the first page of a PDF.

    The QR code is drawn as a 100x100 image at position (50, 50) on a
    one-page overlay PDF, which is merged into the first page of
    ``pdf_path``. All other pages are copied unchanged.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path the stamped PDF is written to.
        url: Data encoded in the QR code.

    Raises:
        IndexError: If the input PDF has no pages.
    """
    # Imported here so the function does not depend on an edit to the
    # file-level import block.
    from reportlab.lib.utils import ImageReader

    # Build the QR code image.
    qr = qrcode.QRCode(version=1, box_size=10, border=4)
    qr.add_data(url)
    qr.make(fit=True)
    qr_img = qr.make_image(fill_color="black", back_color="white")

    # Render the QR code onto an in-memory, single-page PDF.
    packet = io.BytesIO()
    can = canvas.Canvas(packet)
    # drawImage expects a filename or an ImageReader, so the in-memory image
    # is wrapped before drawing.
    can.drawImage(
        ImageReader(qr_img.get_image()), 50, 50, width=100, height=100
    )
    can.save()
    packet.seek(0)
    qr_pdf = PdfReader(packet)

    # Merge the overlay into the first page and copy the remaining pages.
    pages = list(PdfReader(pdf_path).pages)
    first_page, remaining_pages = pages[0], pages[1:]
    first_page.merge_page(qr_pdf.pages[0])

    output = PdfWriter()
    output.add_page(first_page)
    for page in remaining_pages:
        output.add_page(page)

    with open(output_path, "wb") as output_file:
        output.write(output_file)


# Usage example
banned_words = ["敏感词1", "敏感词2", "违禁词"]

# Input file, intermediate result and final result.
input_pdf, cleaned_pdf, final_pdf = "original.pdf", "cleaned.pdf", "final.pdf"

# Step 1: run the banned-word pass and write the intermediate PDF.
remove_banned_words(input_pdf, cleaned_pdf, banned_words)

# Step 2: add the QR code to the intermediate PDF.
add_qr_code(cleaned_pdf, final_pdf, "https://yourdomain.com/view/123")

方案二:基于Web的高级处理方案

from flask import Flask, request, send_file, jsonify
import os
import qrcode
from PyPDF2 import PdfReader
import io
import re

# Flask application object that the route decorators register against.
app = Flask(__name__)

# Directory for uploaded PDFs and generated QR images; created at import
# time if it does not exist yet.
UPLOAD_FOLDER = 'uploads'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Banned-word list (could be loaded from a database or a file)
BANNED_WORDS = ["敏感词", "违禁词", "政治术语"]


def sanitize_text(text, replacement="***"):
    """Return ``text`` with every entry of ``BANNED_WORDS`` replaced.

    Matching is literal and case-insensitive.

    Args:
        text: The text to clean.
        replacement: Text substituted for every match.
    """
    for word in BANNED_WORDS:
        if not word:
            # An empty pattern would match at every position.
            continue
        # A callable replacement is inserted literally; a plain string would
        # have its backslashes interpreted as regex escape sequences.
        text = re.sub(
            re.escape(word),
            lambda _match: replacement,
            text,
            flags=re.IGNORECASE,
        )
    return text


@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept a PDF upload, store it and generate a QR code linking to it.

    Returns:
        A JSON response with ``file_id``, ``qr_code_url`` and ``pdf_url``,
        or a JSON ``error`` message with status 400 when no usable PDF was
        uploaded.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    # The filename comes from the client. Keep only its last path component
    # so a name such as "../../x.pdf" cannot escape UPLOAD_FOLDER.
    filename = os.path.basename((file.filename or '').replace('\\', '/'))
    if not filename.lower().endswith('.pdf'):
        return jsonify({"error": "Invalid file type"}), 400

    # The file ID is the filename without its extension. The file is stored
    # as "<file_id>.pdf" so view_pdf() finds it even when the upload used an
    # upper-case extension.
    file_id = os.path.splitext(filename)[0]
    filepath = os.path.join(UPLOAD_FOLDER, f"{file_id}.pdf")
    file.save(filepath)

    # Banned-word pass (simplified; real PDF rewriting is more involved).
    reader = PdfReader(filepath)
    text_content = "".join(page.extract_text() or "" for page in reader.pages)
    # NOTE(review): the sanitized text is not written back anywhere yet, so
    # /view/<file_id> still serves the PDF exactly as uploaded.
    sanitized_content = sanitize_text(text_content)

    # URL under which the stored PDF can be viewed.
    pdf_url = f"{request.host_url}view/{file_id}"

    # Encode that URL as a QR code and store the image next to the PDF.
    qr = qrcode.QRCode(version=1, box_size=10, border=4)
    qr.add_data(pdf_url)
    qr.make(fit=True)
    qr_img = qr.make_image(fill_color="black", back_color="white")
    qr_path = os.path.join(UPLOAD_FOLDER, f"{file_id}_qr.png")
    qr_img.save(qr_path)

    return jsonify({
        "file_id": file_id,
        "qr_code_url": f"{request.host_url}qr/{file_id}",
        "pdf_url": pdf_url,
    })


@app.route('/view/<file_id>')
def view_pdf(file_id):
    """Serve the stored PDF for ``file_id`` inline, or a 404 message."""
    # An absolute path guarantees that the file being sent is the one the
    # existence check looked at, however send_file resolves relative paths.
    pdf_path = os.path.abspath(os.path.join(UPLOAD_FOLDER, f"{file_id}.pdf"))
    if os.path.exists(pdf_path):
        return send_file(pdf_path, as_attachment=False)
    return "PDF not found", 404


@app.route('/qr/<file_id>')
def get_qr(file_id):
    """Serve the stored QR code image for ``file_id``, or a 404 message."""
    qr_path = os.path.abspath(os.path.join(UPLOAD_FOLDER, f"{file_id}_qr.png"))
    if os.path.exists(qr_path):
        return send_file(qr_path, mimetype='image/png')
    return "QR code not found", 404


if __name__ == '__main__':
    # The server listens on all interfaces, and the interactive debugger lets
    # anyone who can reach it run code. Debug mode is therefore off unless
    # FLASK_DEBUG=1 is set explicitly.
    app.run(
        host='0.0.0.0',
        port=5000,
        debug=os.environ.get('FLASK_DEBUG') == '1',
    )

高级PDF处理建议

对于更精确的PDF违禁词处理,建议:

  1. 使用专业PDF处理库:
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams


def extract_pdf_text(pdf_path):
    """Return the result of pdfminer's ``extract_text`` for ``pdf_path``.

    A default-constructed ``LAParams`` object is passed as the layout
    analysis parameters.
    """
    return extract_text(pdf_path, laparams=LAParams())
  2. 考虑使用OCR处理扫描版PDF:
import pytesseract
from PIL import Image
import pdf2image


def ocr_pdf(pdf_path):
    """Return the OCR text of a PDF as one string.

    The PDF is converted with ``pdf2image.convert_from_path`` and the result
    of ``pytesseract.image_to_string`` for each image is concatenated in
    order, without any separator.
    """
    page_images = pdf2image.convert_from_path(pdf_path)
    return "".join(
        pytesseract.image_to_string(page_image) for page_image in page_images
    )
  3. 对于精确的PDF内容修改,考虑专业的PDF处理库(商业或开源),如:
  • PDFlib
  • iText
  • Apache PDFBox

部署注意事项

  1. 违禁词列表应定期更新
  2. 考虑使用Redis缓存处理结果
  3. 对于大规模处理,使用Celery进行异步任务处理
  4. 添加用户认证和权限控制

这个方案结合了违禁词处理和二维码生成功能,可以根据你的具体需求进行调整。