-
Notifications
You must be signed in to change notification settings - Fork 5.9k
Open
Labels
Description
bug描述 Describe the Bug
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
RUN pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
RUN pip install paddleocr
代码:
#!/usr/bin/env python3
"""
PaddleOCR 修复版 - 适配最新 API
"""
import os
import sys
import json
import time
import cv2
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# 设置环境变量
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
# 尝试导入 PaddleOCR
try:
from paddleocr import PaddleOCR
print("✅ PaddleOCR 导入成功")
print(f"PaddleOCR 版本信息: 使用最新 API")
except ImportError as e:
print(f"❌ 导入 PaddleOCR 失败: {e}")
sys.exit(1)
class FixedPaddleOCRAnalyzer:
"""修复后的 PaddleOCR 分析器"""
def __init__(self, use_gpu=False, lang='ch', ocr_version='PP-OCRv4'):
"""
初始化 PaddleOCR
参数:
use_gpu: 是否使用 GPU
lang: 语言 ('ch' 或 'en')
ocr_version: OCR 版本 ('PP-OCRv3', 'PP-OCRv4', 'PP-OCRv5')
"""
print(f"🚀 初始化 PaddleOCR (版本: {ocr_version}, 语言: {lang})...")
try:
# 使用最新的 API 参数
self.ocr = PaddleOCR(
# 文档方向检测
use_doc_orientation_classify=False,
# 文档矫正
use_doc_unwarping=False,
# 文本检测
text_detection_model_name=None, # 自动选择
#text_det_limit_side_len=960, # 图片长边限制
text_det_limit_type='max', # 限制方式
text_det_thresh=0.3, # 检测阈值
text_det_box_thresh=0.6, # 文本框阈值
text_det_unclip_ratio=1.5, # 文本框扩展比例
# 文本识别
text_recognition_model_name=None, # 自动选择
text_rec_score_thresh=0.5, # 识别置信度阈值
text_rec_input_shape='3, 48, 320', # 识别输入形状
# 文本方向分类
#use_textline_orientation=True, # 启用文本方向分类
textline_orientation_model_name=None, # 自动选择
# 批次大小
textline_orientation_batch_size=1,
#text_recognition_batch_size=1,
# 其他参数
lang=lang, # 语言
ocr_version=ocr_version, # OCR 版本
# show_log=False, # 不显示详细日志
#use_gpu=use_gpu, # 是否使用 GPU
# 兼容性参数(会映射到新参数)
#det_limit_side_len=960, # 映射到 text_det_limit_side_len
rec_batch_num=1, # 映射到 text_recognition_batch_size
use_angle_cls=True, # 映射到 use_textline_orientation
)
print(f"✅ PaddleOCR 初始化成功")
except Exception as e:
print(f"❌ 初始化失败: {e}")
print("尝试使用简化初始化...")
# 简化初始化
self.ocr = PaddleOCR(
use_angle_cls=True,
lang=lang,
#use_gpu=use_gpu,
#show_log=False
)
print("✅ 使用简化初始化完成")
def analyze_image(self, image_path):
"""
分析图片
返回:
dict: 包含图片和分析结果
"""
print(f"\n📄 分析图片: {os.path.basename(image_path)}")
# 检查文件
if not os.path.exists(image_path):
raise FileNotFoundError(f"图片不存在: {image_path}")
# 读取图片
img = cv2.imread(image_path)
if img is None:
raise ValueError(f"无法读取图片: {image_path}")
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
height, width = img.shape[:2]
print(f"📏 图片尺寸: {width} x {height}")
# 执行 OCR
start_time = time.time()
try:
result = self.ocr.predict( image_path )
except Exception as e:
print(f"❌ OCR 分析失败: {e}")
# 尝试直接传递图片
result = self.ocr.ocr(img )
analysis_time = time.time() - start_time
print(f"⏱️ 分析耗时: {analysis_time:.2f}秒")
# 解析结果
text_blocks = self._parse_ocr_result(result, width, height)
return {
'image': img_rgb,
'image_shape': (height, width),
'analysis_result': result,
'text_blocks': text_blocks,
'analysis_time': analysis_time,
'num_blocks': len(text_blocks)
}
def _parse_ocr_result(self, result, img_width, img_height):
"""
解析 OCR 结果
注意: PaddleOCR 的结果结构:
[
[ # 第一张图片
[ # 第一个文本框
[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], # 文本框坐标
[text, confidence] # 文本和置信度
],
...
]
]
"""
text_blocks = []
if not result or len(result) == 0:
return text_blocks
# 处理第一张图片的结果
for line in result[0]:
if len(line) >= 2:
bbox = line[0] # 文本框坐标
text_info = line[1] # 文本信息
if len(text_info) >= 2:
text = str(text_info[0])
confidence = float(text_info[1])
# 转换为 numpy 数组方便计算
points = np.array(bbox, dtype=np.float32)
# 计算边界框
x_coords = points[:, 0]
y_coords = points[:, 1]
x1, y1 = np.min(x_coords), np.min(y_coords)
x2, y2 = np.max(x_coords), np.max(y_coords)
# 计算中心点
center_x = np.mean(x_coords)
center_y = np.mean(y_coords)
# 计算面积
area = (x2 - x1) * (y2 - y1)
area_percentage = (area / (img_width * img_height)) * 100
block = {
'bbox': bbox,
'bbox_rect': [float(x1), float(y1), float(x2), float(y2)],
'center': [float(center_x), float(center_y)],
'text': text,
'confidence': confidence,
'area': float(area),
'area_percentage': float(area_percentage),
'width': float(x2 - x1),
'height': float(y2 - y1)
}
text_blocks.append(block)
# 按置信度排序
text_blocks.sort(key=lambda x: x['confidence'], reverse=True)
return text_blocks
def visualize_results(self, result, output_path, dpi=300):
"""可视化结果"""
print(f"\n🎨 生成可视化结果...")
img = result['image']
text_blocks = result['text_blocks']
# 创建图形
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
# 1. 原始图片
ax1.imshow(img)
ax1.set_title('原始图片', fontsize=16, fontweight='bold')
ax1.axis('off')
# 2. 分析结果
result_img = img.copy()
# 使用颜色映射
colors = plt.cm.rainbow(np.linspace(0, 1, len(text_blocks)))
for idx, block in enumerate(text_blocks):
color = colors[idx % len(colors)]
color_bgr = (int(color[2] * 255), int(color[1] * 255), int(color[0] * 255))
# 绘制多边形
points = np.array(block['bbox'], dtype=np.int32)
cv2.polylines(result_img, [points], isClosed=True,
color=color_bgr, thickness=3)
# 绘制中心点
center_x, center_y = int(block['center'][0]), int(block['center'][1])
cv2.circle(result_img, (center_x, center_y), 5, color_bgr, -1)
# 添加编号和置信度
text = f"{idx + 1}:{block['confidence']:.2f}"
cv2.putText(result_img, text, (center_x, center_y - 10),
cv2.FONT_HERSHEY_SIMPLEX, 0.5, color_bgr, 2)
# 如果是高置信度文本,显示内容
if block['confidence'] > 0.8 and len(block['text']) < 20:
cv2.putText(result_img, block['text'][:15],
(center_x, center_y + 20),
cv2.FONT_HERSHEY_SIMPLEX, 0.4, color_bgr, 1)
ax2.imshow(result_img)
ax2.set_title(f'OCR 结果 (检测到 {len(text_blocks)} 个文本框)',
fontsize=16, fontweight='bold')
ax2.axis('off')
# 添加统计信息
stats_text = (f"分析统计:\n"
f"文本框数: {len(text_blocks)}\n"
f"分析时间: {result['analysis_time']:.2f}秒\n"
f"平均置信度: {np.mean([b['confidence'] for b in text_blocks]):.3f}")
fig.text(0.5, 0.02, stats_text, ha='center', fontsize=12,
bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgray", alpha=0.8))
plt.suptitle('PaddleOCR 文本检测与识别', fontsize=20, fontweight='bold', y=0.98)
plt.tight_layout(rect=[0, 0.05, 1, 0.95])
# 保存
plt.savefig(output_path, dpi=dpi, bbox_inches='tight')
plt.close()
print(f"✅ 可视化结果已保存: {output_path}")
return output_path
def save_text_results(self, result, output_path):
"""保存文本结果"""
text_blocks = result['text_blocks']
with open(output_path, 'w', encoding='utf-8') as f:
f.write("=" * 60 + "\n")
f.write("PADDLEOCR 文本检测与识别结果\n")
f.write("=" * 60 + "\n\n")
f.write(f"分析时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"图片尺寸: {result['image_shape'][1]} x {result['image_shape'][0]}\n")
f.write(f"分析耗时: {result['analysis_time']:.2f}秒\n")
f.write(f"检测到文本框数: {len(text_blocks)}\n\n")
# 统计置信度分布
confidences = [b['confidence'] for b in text_blocks]
if confidences:
f.write(f"置信度统计:\n")
f.write(f" 最高: {max(confidences):.4f}\n")
f.write(f" 最低: {min(confidences):.4f}\n")
f.write(f" 平均: {np.mean(confidences):.4f}\n")
f.write(f" 中位数: {np.median(confidences):.4f}\n\n")
f.write("-" * 60 + "\n")
f.write("📋 详细结果:\n")
f.write("-" * 60 + "\n\n")
for idx, block in enumerate(text_blocks, 1):
f.write(f"文本框 {idx}:\n")
f.write(f" 文本: {block['text']}\n")
f.write(f" 置信度: {block['confidence']:.4f}\n")
f.write(f" 位置: {block['bbox_rect']}\n")
f.write(f" 尺寸: {block['width']:.1f} x {block['height']:.1f}\n")
f.write(f" 面积占比: {block['area_percentage']:.2f}%\n")
f.write("-" * 40 + "\n")
# 所有文本合并
f.write("\n" + "=" * 60 + "\n")
f.write("📄 完整文本内容:\n")
f.write("=" * 60 + "\n\n")
for idx, block in enumerate(text_blocks, 1):
f.write(f"[{idx}] {block['text']}\n")
print(f"✅ 文本结果已保存: {output_path}")
return output_path
def save_json_results(self, result, output_path):
"""保存 JSON 结果"""
data = {
'metadata': {
'filename': os.path.basename(output_path).replace('.json', ''),
'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
'tool': 'PaddleOCR',
'api_version': 'latest'
},
'image_info': {
'width': result['image_shape'][1],
'height': result['image_shape'][0],
'channels': 3
},
'analysis_info': {
'analysis_time': result['analysis_time'],
'num_blocks': len(result['text_blocks']),
'avg_confidence': float(np.mean([b['confidence'] for b in result['text_blocks']]))
},
'results': result['text_blocks']
}
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
print(f"✅ JSON 结果已保存: {output_path}")
return output_path
def export_excel(self, result, output_path):
"""导出 Excel 结果"""
try:
import pandas as pd
data = []
for idx, block in enumerate(result['text_blocks'], 1):
data.append({
'序号': idx,
'文本内容': block['text'],
'置信度': block['confidence'],
'左上角X': block['bbox_rect'][0],
'左上角Y': block['bbox_rect'][1],
'右下角X': block['bbox_rect'][2],
'右下角Y': block['bbox_rect'][3],
'宽度': block['width'],
'高度': block['height'],
'面积占比%': block['area_percentage']
})
df = pd.DataFrame(data)
df.to_excel(output_path, index=False)
print(f"✅ Excel 结果已保存: {output_path}")
return output_path
except ImportError:
print("⚠️ 未安装 pandas,跳过 Excel 导出")
return None
def process_image(self, image_path, output_dir='./output',
visualize=True, save_json=True, save_txt=True, save_excel=False):
"""处理单张图片"""
# 创建输出目录
os.makedirs(output_dir, exist_ok=True)
# 获取文件名
filename = os.path.splitext(os.path.basename(image_path))[0]
# 分析图片
try:
result = self.analyze_image(image_path)
except Exception as e:
print(f"❌ 图片分析失败: {e}")
return None
output_files = {}
# 可视化结果
if visualize:
viz_path = os.path.join(output_dir, f"{filename}_result.jpg")
output_files['visualization'] = self.visualize_results(result, viz_path)
# 文本结果
if save_txt:
txt_path = os.path.join(output_dir, f"{filename}_result.txt")
output_files['text'] = self.save_text_results(result, txt_path)
# JSON 结果
if save_json:
json_path = os.path.join(output_dir, f"{filename}_result.json")
output_files['json'] = self.save_json_results(result, json_path)
# Excel 结果
if save_excel:
excel_path = os.path.join(output_dir, f"{filename}_result.xlsx")
excel_file = self.export_excel(result, excel_path)
if excel_file:
output_files['excel'] = excel_file
# 打印摘要
self._print_summary(result, output_files)
return {
'analysis_result': result,
'output_files': output_files
}
def _print_summary(self, result, output_files):
"""打印处理摘要"""
print("\n" + "=" * 60)
print("📄 处理摘要")
print("=" * 60)
print(f"图片尺寸: {result['image_shape'][1]} x {result['image_shape'][0]}")
print(f"检测到文本框数: {len(result['text_blocks'])}")
print(f"分析耗时: {result['analysis_time']:.2f}秒")
if result['text_blocks']:
confidences = [b['confidence'] for b in result['text_blocks']]
print(f"置信度统计:")
print(f" 最高: {max(confidences):.4f}")
print(f" 最低: {min(confidences):.4f}")
print(f" 平均: {np.mean(confidences):.4f}")
print("\n生成的文件:")
for file_type, file_path in output_files.items():
print(f" {file_type}: {os.path.basename(file_path)}")
print("=" * 60)
def main():
"""主函数"""
import argparse
parser = argparse.ArgumentParser(description='PaddleOCR 文本检测与识别工具')
parser.add_argument('--input', '-i', type=str, required=True,
help='输入图片路径')
parser.add_argument('--output', '-o', type=str, default='./output',
help='输出目录')
parser.add_argument('--gpu', action='store_true',
help='使用 GPU 加速')
parser.add_argument('--lang', type=str, default='ch',
help='语言 (ch: 中文, en: 英文)')
parser.add_argument('--version', type=str, default='PP-OCRv4',
help='OCR 版本 (PP-OCRv3, PP-OCRv4, PP-OCRv5)')
args = parser.parse_args()
# 检查输入文件
if not os.path.exists(args.input):
print(f"❌ 输入文件不存在: {args.input}")
return 1
print("=" * 60)
print("PaddleOCR 文本检测与识别工具")
print("=" * 60)
print(f"输入文件: {args.input}")
print(f"输出目录: {args.output}")
print(f"使用 GPU: {args.gpu}")
print(f"语言: {args.lang}")
print(f"OCR 版本: {args.version}")
print("=" * 60)
try:
# 初始化分析器
analyzer = FixedPaddleOCRAnalyzer(
# use_gpu=args.gpu,
lang=args.lang,
ocr_version=args.version
)
# 处理图片
result = analyzer.process_image(
image_path=args.input,
output_dir=args.output,
visualize=True,
save_json=True,
save_txt=True,
save_excel=True
)
if result:
print("\n✅ 处理完成!")
return 0
else:
print("\n❌ 处理失败!")
return 1
except Exception as e:
print(f"\n❌ 处理过程中发生错误: {e}")
import traceback
traceback.print_exc()
return 1
# 直接运行示例
if __name__ == "__main__":
if len(sys.argv) == 1:
# 如果没有参数,显示帮助
print("使用示例:")
print(" python fixed_paddleocr.py --input ./input/document.jpg --output ./output")
print("\n或直接运行测试:")
# 测试代码
test_image = "./input/document.jpg"
if not os.path.exists(test_image):
print(f"\n⚠️ 请先创建测试图片目录: mkdir input")
print(f"并将图片放入: {test_image}")
print("\n正在创建示例图片...")
os.makedirs("./input", exist_ok=True)
# 创建示例图片
img = np.ones((600, 800, 3), dtype=np.uint8) * 255
# 添加文本
cv2.putText(img, "文档标题", (200, 100),
cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 0), 3)
cv2.putText(img, "这是文档的第一段内容。", (50, 200),
cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
cv2.putText(img, "这是第二段内容,包含一些重要的信息。", (50, 250),
cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
cv2.putText(img, "表格数据:", (50, 350),
cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 0), 2)
cv2.putText(img, "项目A | 100元", (100, 400),
cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
cv2.putText(img, "项目B | 200元", (100, 450),
cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
cv2.putText(img, "文档结束", (300, 550),
cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
cv2.imwrite(test_image, img)
print(f"✅ 已创建示例图片: {test_image}")
# 运行分析
analyzer = FixedPaddleOCRAnalyzer(use_gpu=False, lang='ch')
analyzer.process_image(test_image, './output')
else:
sys.exit(main())
docker build -t vl .
docker run --rm -v C:/Users/xx/Desktop/PaddleOCR-VL/images:/app/images -v C:/Users/xx/Desktop/PaddleOCR-VL/models:/root/.paddlex/ --gpus all vl
输出:
Connectivity check to the model hoster has been skipped because PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK is enabled.
Creating model: ('PP-LCNet_x1_0_textline_ori', None)
Model files already exist. Using cached files. To redownload, please delete the directory manually: /root/.paddlex/official_models/PP-LCNet_x1_0_textline_ori.
Creating model: ('PP-OCRv4_mobile_det', None)
Model files already exist. Using cached files. To redownload, please delete the directory manually: /root/.paddlex/official_models/PP-OCRv4_mobile_det.
Creating model: ('PP-OCRv4_mobile_rec', None)
Model files already exist. Using cached files. To redownload, please delete the directory manually: /root/.paddlex/official_models/PP-OCRv4_mobile_rec.
✅ PaddleOCR 导入成功
PaddleOCR 版本信息: 使用最新 API
PaddleOCR 文本检测与识别工具
输入文件: /app/images/1.png
输出目录: /app/images
使用 GPU: False
语言: ch
OCR 版本: PP-OCRv4
🚀 初始化 PaddleOCR (版本: PP-OCRv4, 语言: ch)...
✅ PaddleOCR 初始化成功
📄 分析图片: 1.png
📏 图片尺寸: 1005 x 1553
❌ OCR 分析失败: too many values to unpack (expected 3)
❌ 图片分析失败: too many values to unpack (expected 3)
❌ 处理失败!
too many values to unpack (expected 3) 这个报错是啥问题,换了好几个版本都不行!
其他补充信息 Additional Supplementary Information
No response
Reactions are currently unavailable