1. Interface Overview
1.1 Introduction
t_img is a dedicated image-translation endpoint. It combines text recognition (OCR) and translation of image content in a single call, delivering "what you see is what you get" translation without manually extracting the text from the image first.
1.2 Key Features
✅ Integrated OCR + translation: recognizes text in the image and translates it automatically
✅ Multiple formats: JPG/PNG/GIF/BMP/WEBP and other mainstream formats
✅ Multiple scenarios: documents, posters, screenshots, ID cards, menus, and more
✅ Layout preservation: keeps the original image layout
✅ Multilingual recognition: text recognition for 200+ languages
✅ Batch processing: large-scale batch translation of images
✅ Smart optimization: automatic deskewing, denoising, and enhancement to improve recognition
2. Prerequisites
2.1 Environment Setup
requirements.txt
requests>=2.28.0
python-dotenv>=1.0.0
pydantic>=2.0.0
aiohttp>=3.8.0
redis>=4.5.0
Pillow>=10.0.0
opencv-python>=4.8.0
numpy>=1.24.0
pytesseract>=0.3.0
google-cloud-vision>=3.0.0
baidu-aip>=4.0.0
tencentcloud-sdk-python>=3.0.0
alibabacloud_ocr_api>=1.0.0
2.2 Authentication Configuration
config.py
import os
from dotenv import load_dotenv
from typing import Dict, Any, List
load_dotenv()
class ImageTranslationConfig:
    # OCR service configuration
OCR_SERVICES = {
'baidu': {
'app_id': os.getenv('BAIDU_OCR_APP_ID'),
'api_key': os.getenv('BAIDU_OCR_API_KEY'),
'secret_key': os.getenv('BAIDU_OCR_SECRET_KEY'),
'api_base': 'https://aip.baidubce.com/rest/2.0/ocr/v1/'
},
'tencent': {
'secret_id': os.getenv('TENCENT_OCR_SECRET_ID'),
'secret_key': os.getenv('TENCENT_OCR_SECRET_KEY'),
'api_base': 'https://ocr.tencentcloudapi.com'
},
'aliyun': {
'access_key_id': os.getenv('ALIYUN_OCR_ACCESS_KEY_ID'),
'access_key_secret': os.getenv('ALIYUN_OCR_ACCESS_KEY_SECRET'),
'api_base': 'https://ocr.cn-shanghai.aliyuncs.com'
},
'google': {
'api_key': os.getenv('GOOGLE_OCR_API_KEY'),
'api_base': 'https://vision.googleapis.com/v1/images:annotate'
},
'tesseract': {
'lang': 'eng+chi_sim',
'config': '--oem 3 --psm 6'
}
}
    # Translation service configuration
TRANSLATION_SERVICES = {
'baidu': {
'app_id': os.getenv('BAIDU_TRANSLATE_APP_ID'),
'app_key': os.getenv('BAIDU_TRANSLATE_APP_KEY'),
'api_base': 'https://fanyi-api.baidu.com/api/trans/vip/translate'
},
'google': {
'api_key': os.getenv('GOOGLE_TRANSLATE_API_KEY'),
'api_base': 'https://translation.googleapis.com/language/translate/v2'
}
}
    # Image processing configuration
    IMAGE_PROCESSING = {
        'max_size': (4096, 4096),           # maximum image dimensions
        'quality': 85,                      # JPEG quality (1-100)
        'supported_formats': ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp'],
        'max_file_size': 10 * 1024 * 1024,  # 10 MB
        'auto_enhance': True,               # automatic enhancement
        'denoise': True,                    # denoising
        'deskew': True,                     # deskewing
    }
    # Request settings
REQUEST_TIMEOUT = 60
MAX_RETRIES = 3
BATCH_SIZE = 10
MAX_CONCURRENT_REQUESTS = 5
    # Cache settings
    CACHE_TTL = 86400  # 24 hours
REDIS_URL = os.getenv('REDIS_URL', 'redis://localhost:6379/0')
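The configuration above reads all credentials from environment variables. A sample .env file covering the variables referenced in config.py might look like this (all values are placeholders; fill in the keys for the services you actually use):

# .env
BAIDU_OCR_APP_ID=your_baidu_ocr_app_id
BAIDU_OCR_API_KEY=your_baidu_ocr_api_key
BAIDU_OCR_SECRET_KEY=your_baidu_ocr_secret_key
TENCENT_OCR_SECRET_ID=your_tencent_secret_id
TENCENT_OCR_SECRET_KEY=your_tencent_secret_key
ALIYUN_OCR_ACCESS_KEY_ID=your_aliyun_access_key_id
ALIYUN_OCR_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
GOOGLE_OCR_API_KEY=your_google_vision_api_key
BAIDU_TRANSLATE_APP_ID=your_baidu_translate_app_id
BAIDU_TRANSLATE_APP_KEY=your_baidu_translate_app_key
GOOGLE_TRANSLATE_API_KEY=your_google_translate_api_key
REDIS_URL=redis://localhost:6379/0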
3. Endpoint Details
3.1 Endpoint
POST /image/translate
3.2 Request Parameters
Basic parameters

| Parameter | Type | Required | Description | Example |
| --- | --- | --- | --- | --- |
| image | file/string | Yes | Image file or Base64-encoded image | - |
| source_lang | string | Yes | Source language code | "zh" |
| target_lang | string | Yes | Target language code | "en" |
| ocr_service | string | No | OCR service | "baidu/tencent/google/tesseract" |
| translation_service | string | No | Translation service | "baidu/google" |
Advanced parameters

| Parameter | Type | Required | Description | Example |
| --- | --- | --- | --- | --- |
| format | string | No | Return format | "text/image/json" |
| preserve_layout | bool | No | Preserve the original layout | true |
| enhance_image | bool | No | Enhance the image before OCR | true |
| confidence_threshold | float | No | Minimum confidence threshold | 0.7 |
| detect_language | bool | No | Auto-detect the source language | true |
| batch_mode | bool | No | Batch mode | false |
| ocr_params | object | No | Extra OCR parameters | {"recognize_granularity": "big"} |
3.3 Response Parameters

| Parameter | Type | Description | Example |
| --- | --- | --- | --- |
| success | bool | Whether the request succeeded | true |
| code | int | Status code | 200 |
| message | string | Status message | "成功" |
| data | object | Raw OCR/translation payloads | - |
| original_text | string | Text recognized from the image | "你好世界" |
| translated_text | string | Translated text | "Hello World" |
| translated_image | string | Translated image (Base64) | "data:image/png;base64,..." |
| ocr_confidence | float | OCR confidence | 0.92 |
| translation_confidence | float | Translation confidence | 0.88 |
| detected_language | string | Detected source language | "zh" |
| text_regions | array | Text region details | [...] |
| processing_time | float | Processing time in seconds | 2.345 |
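An illustrative response putting the fields above together. The values mirror the examples in the table and the exact contents of data are implementation-specific:

{
  "success": true,
  "code": 200,
  "message": "成功",
  "data": {
    "ocr_result": {"...": "raw OCR payload"},
    "translation_result": {"...": "raw translation payload"}
  },
  "original_text": "你好世界",
  "translated_text": "Hello World",
  "translated_image": "data:image/png;base64,...",
  "ocr_confidence": 0.92,
  "translation_confidence": 0.88,
  "detected_language": "zh",
  "text_regions": [],
  "processing_time": 2.345
}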
4. Full Implementation
4.1 Python Implementation
from __future__ import annotations  # OCRResult / TranslationResult are referenced before they are defined below

import requests
import time
import hashlib
import hmac
import json
import base64
import io
import tempfile
import os
from typing import Dict, Any, List, Optional, Union, Tuple
from datetime import datetime, timedelta
from dataclasses import dataclass, field
from pathlib import Path
from urllib.parse import urlencode
import redis
import asyncio
import aiohttp
from PIL import Image, ImageDraw, ImageEnhance, ImageFilter, ImageFont
import cv2
import numpy as np
import pytesseract
from google.cloud import vision
from aip import AipOcr
from tencentcloud.common import credential
from tencentcloud.ocr.v20181119 import ocr_client, models
@dataclass
class ImageTranslationRequest:
"""图片翻译请求"""
image: Union[str, bytes, Image.Image] # 图片路径/Base64/二进制/PIL对象
source_lang: str
target_lang: str
ocr_service: str = "baidu"
translation_service: str = "baidu"
format: str = "text" # text/image/json
preserve_layout: bool = True
enhance_image: bool = True
confidence_threshold: float = 0.7
detect_language: bool = True
ocr_params: Dict[str, Any] = field(default_factory=dict)
translation_params: Dict[str, Any] = field(default_factory=dict)
@dataclass
class TextRegion:
"""文字区域信息"""
text: str
confidence: float
bounding_box: List[Tuple[int, int]] # [(x1,y1), (x2,y2), (x3,y3), (x4,y4)]
font_size: Optional[int] = None
font_color: Optional[str] = None
background_color: Optional[str] = None
language: Optional[str] = None
@dataclass
class ImageTranslationResult:
    """Image translation result"""
    success: bool
    code: int
    message: str
    data: Dict[str, Any]
    original_text: str
    translated_text: str
    source_lang: str
    target_lang: str
    ocr_confidence: float
    translation_confidence: float
    text_regions: List[TextRegion]
    ocr_service: str
    translation_service: str
    processing_time: float
    # Fields with defaults must come after all non-default fields
    translated_image: Optional[str] = None  # Base64-encoded result image
    detected_language: Optional[str] = None
    warnings: List[str] = field(default_factory=list)
@dataclass
class BatchImageTranslationResult:
"""批量图片翻译结果"""
success: bool
code: int
message: str
data: Dict[str, Any]
results: List[ImageTranslationResult]
total_count: int
success_count: int
failed_count: int
total_time: float
class ImageTranslationAPI:
"""图片翻译API客户端"""
def __init__(self, config: Dict[str, Any], redis_client=None):
self.config = config
self.redis = redis_client
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Image-Translation-API/1.0',
'Accept': 'application/json'
})
# 初始化OCR客户端
self.ocr_clients = self._init_ocr_clients()
# 初始化翻译客户端
self.translation_clients = self._init_translation_clients()
def _init_ocr_clients(self) -> Dict[str, Any]:
"""初始化OCR客户端"""
clients = {}
ocr_config = self.config.get('OCR_SERVICES', {})
# 百度OCR
if ocr_config.get('baidu'):
baidu_config = ocr_config['baidu']
clients['baidu'] = AipOcr(
baidu_config.get('app_id'),
baidu_config.get('api_key'),
baidu_config.get('secret_key')
)
# Google Vision
if ocr_config.get('google'):
google_config = ocr_config['google']
clients['google'] = vision.ImageAnnotatorClient()
# 腾讯OCR
if ocr_config.get('tencent'):
tencent_config = ocr_config['tencent']
cred = credential.Credential(
tencent_config.get('secret_id'),
tencent_config.get('secret_key')
)
clients['tencent'] = ocr_client.OcrClient(cred, "ap-beijing")
# Tesseract OCR
if ocr_config.get('tesseract'):
clients['tesseract'] = pytesseract
return clients
def _init_translation_clients(self) -> Dict[str, Any]:
"""初始化翻译客户端"""
clients = {}
translation_config = self.config.get('TRANSLATION_SERVICES', {})
# 百度翻译
if translation_config.get('baidu'):
baidu_config = translation_config['baidu']
clients['baidu'] = BaiduTranslationClient(baidu_config)
# Google翻译
if translation_config.get('google'):
google_config = translation_config['google']
clients['google'] = GoogleTranslationClient(google_config)
return clients
def translate_image(
self,
image: Union[str, bytes, Image.Image],
source_lang: str,
target_lang: str,
**kwargs
) -> ImageTranslationResult:
"""
图片翻译
Args:
image: 图片(路径/Base64/二进制/PIL对象)
source_lang: 源语言
target_lang: 目标语言
**kwargs: 其他参数
Returns:
图片翻译结果
"""
start_time = time.time()
# 构建请求
request = ImageTranslationRequest(
image=image,
source_lang=source_lang,
target_lang=target_lang,
**kwargs
)
# 检查缓存
cache_key = self._get_cache_key(request)
if self.redis:
cached = self.redis.get(cache_key)
if cached:
data = json.loads(cached)
return ImageTranslationResult(**data)
try:
# 1. 预处理图片
processed_image = self._preprocess_image(request.image)
# 2. OCR文字识别
ocr_result = self._perform_ocr(processed_image, request)
if not ocr_result.success or not ocr_result.text_regions:
return ImageTranslationResult(
success=False,
code=400,
message="图片文字识别失败",
data={},
original_text="",
translated_text="",
source_lang=source_lang,
target_lang=target_lang,
detected_language=None,
ocr_confidence=0.0,
translation_confidence=0.0,
text_regions=[],
ocr_service=request.ocr_service,
translation_service=request.translation_service,
processing_time=time.time() - start_time
)
# 3. 合并识别文字
original_text = self._combine_text_regions(ocr_result.text_regions)
# 4. 文本翻译
translation_result = self._translate_text(
original_text,
source_lang,
target_lang,
request
)
if not translation_result.success:
return ImageTranslationResult(
success=False,
code=400,
message=f"文本翻译失败: {translation_result.message}",
data={},
original_text=original_text,
translated_text="",
source_lang=source_lang,
target_lang=target_lang,
detected_language=ocr_result.detected_language,
ocr_confidence=ocr_result.avg_confidence,
translation_confidence=0.0,
text_regions=ocr_result.text_regions,
ocr_service=request.ocr_service,
translation_service=request.translation_service,
processing_time=time.time() - start_time
)
# 5. 生成翻译后图片(如果需要)
translated_image = None
if request.format == "image" and request.preserve_layout:
translated_image = self._generate_translated_image(
processed_image,
ocr_result.text_regions,
translation_result.translated_text
)
# 构建结果
result = ImageTranslationResult(
success=True,
code=200,
message="成功",
data={
'ocr_result': ocr_result.data,
'translation_result': translation_result.data
},
original_text=original_text,
translated_text=translation_result.translated_text,
translated_image=translated_image,
source_lang=source_lang,
target_lang=target_lang,
detected_language=ocr_result.detected_language,
ocr_confidence=ocr_result.avg_confidence,
translation_confidence=translation_result.confidence,
text_regions=ocr_result.text_regions,
ocr_service=request.ocr_service,
translation_service=request.translation_service,
processing_time=time.time() - start_time
)
# 缓存结果
if self.redis:
self.redis.setex(
cache_key,
self.config.get('CACHE_TTL', 86400),
                json.dumps(result.__dict__, default=lambda o: o.__dict__)  # TextRegion objects need a default serializer
)
return result
except Exception as e:
processing_time = time.time() - start_time
return ImageTranslationResult(
success=False,
code=500,
message=f"图片翻译失败: {str(e)}",
data={},
original_text="",
translated_text="",
source_lang=source_lang,
target_lang=target_lang,
detected_language=None,
ocr_confidence=0.0,
translation_confidence=0.0,
text_regions=[],
ocr_service=request.ocr_service,
translation_service=request.translation_service,
processing_time=processing_time
)
def _preprocess_image(self, image: Union[str, bytes, Image.Image]) -> Image.Image:
"""图片预处理"""
# 转换为PIL Image
if isinstance(image, str):
# 路径或Base64
if image.startswith('data:image'):
# Base64编码
image_data = base64.b64decode(image.split(',')[1])
img = Image.open(io.BytesIO(image_data))
elif os.path.exists(image):
# 文件路径
img = Image.open(image)
else:
raise ValueError(f"无效的图片路径或Base64: {image}")
elif isinstance(image, bytes):
# 二进制数据
img = Image.open(io.BytesIO(image))
elif isinstance(image, Image.Image):
# PIL对象
img = image
else:
raise ValueError("不支持的图片格式")
# 图片增强
img_config = self.config.get('IMAGE_PROCESSING', {})
if img_config.get('auto_enhance', True):
# 调整大小
max_size = img_config.get('max_size', (4096, 4096))
img.thumbnail(max_size, Image.Resampling.LANCZOS)
# 转换为RGB
if img.mode != 'RGB':
img = img.convert('RGB')
# 增强对比度
enhancer = ImageEnhance.Contrast(img)
img = enhancer.enhance(1.2)
# 增强锐度
enhancer = ImageEnhance.Sharpness(img)
img = enhancer.enhance(1.1)
# 去噪
if img_config.get('denoise', True):
img = img.filter(ImageFilter.MedianFilter(size=3))
return img
def _perform_ocr(
self,
image: Image.Image,
request: ImageTranslationRequest
) -> OCRResult:
"""执行OCR文字识别"""
service = request.ocr_service
params = request.ocr_params
if service == 'baidu':
return self._baidu_ocr(image, params)
elif service == 'google':
return self._google_ocr(image, params)
elif service == 'tencent':
return self._tencent_ocr(image, params)
elif service == 'tesseract':
return self._tesseract_ocr(image, params)
else:
raise ValueError(f"不支持的OCR服务: {service}")
def _baidu_ocr(self, image: Image.Image, params: Dict[str, Any]) -> OCRResult:
"""百度OCR"""
client = self.ocr_clients.get('baidu')
if not client:
raise ValueError("百度OCR客户端未初始化")
        # Encode the image as JPEG bytes (the baidu-aip SDK base64-encodes internally)
        img_buffer = io.BytesIO()
        image.save(img_buffer, format='JPEG', quality=85)
        image_bytes = img_buffer.getvalue()
        # Build the options dict; the SDK's general() takes (image_bytes, options)
        options = {
            'recognize_granularity': 'big',
            'language_type': 'CHN_ENG',
            'detect_direction': 'true'
        }
        options.update(params)
        # Send the request
        response = client.general(image_bytes, options)
if 'words_result' in response:
text_regions = []
total_confidence = 0.0
detected_languages = set()
for item in response['words_result']:
text = item.get('words', '')
confidence = item.get('probability', {}).get('average', 0.0) if isinstance(item.get('probability'), dict) else 0.8
# 解析位置信息
location = item.get('location', {})
bounding_box = [
(location.get('left', 0), location.get('top', 0)),
(location.get('left', 0) + location.get('width', 0), location.get('top', 0)),
(location.get('left', 0) + location.get('width', 0), location.get('top', 0) + location.get('height', 0)),
(location.get('left', 0), location.get('top', 0) + location.get('height', 0))
]
text_region = TextRegion(
text=text,
confidence=confidence,
bounding_box=bounding_box
)
text_regions.append(text_region)
total_confidence += confidence
avg_confidence = total_confidence / len(text_regions) if text_regions else 0.0
return OCRResult(
success=True,
text_regions=text_regions,
avg_confidence=avg_confidence,
                detected_language='zh' if text_regions else None,  # language_type was CHN_ENG
data=response
)
else:
error_msg = response.get('error_msg', 'OCR识别失败')
return OCRResult(
success=False,
text_regions=[],
avg_confidence=0.0,
detected_language=None,
data=response,
error=error_msg
)
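    # NOTE: _tencent_ocr is dispatched in _perform_ocr but was not defined in the original text.
    # The sketch below follows the tencentcloud-sdk-python GeneralBasicOCR API; verify the field
    # names against the SDK version you install before relying on it.
    def _tencent_ocr(self, image: Image.Image, params: Dict[str, Any]) -> OCRResult:
        """Tencent Cloud OCR (sketch)"""
        client = self.ocr_clients.get('tencent')
        if not client:
            raise ValueError("Tencent OCR client is not initialized")
        img_buffer = io.BytesIO()
        image.save(img_buffer, format='JPEG', quality=85)
        img_base64 = base64.b64encode(img_buffer.getvalue()).decode()
        try:
            req = models.GeneralBasicOCRRequest()
            req.ImageBase64 = img_base64
            resp = client.GeneralBasicOCR(req)
            text_regions = []
            total_confidence = 0.0
            for det in resp.TextDetections:
                polygon = [(p.X, p.Y) for p in det.Polygon]
                confidence = det.Confidence / 100.0  # the SDK reports 0-100
                text_regions.append(TextRegion(
                    text=det.DetectedText,
                    confidence=confidence,
                    bounding_box=polygon
                ))
                total_confidence += confidence
            avg_confidence = total_confidence / len(text_regions) if text_regions else 0.0
            return OCRResult(
                success=bool(text_regions),
                text_regions=text_regions,
                avg_confidence=avg_confidence,
                detected_language=None,
                data=json.loads(resp.to_json_string())
            )
        except Exception as e:
            return OCRResult(
                success=False,
                text_regions=[],
                avg_confidence=0.0,
                detected_language=None,
                data={},
                error=str(e)
            )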
def _google_ocr(self, image: Image.Image, params: Dict[str, Any]) -> OCRResult:
"""Google Vision OCR"""
client = self.ocr_clients.get('google')
if not client:
raise ValueError("Google Vision客户端未初始化")
# 转换为字节
img_buffer = io.BytesIO()
image.save(img_buffer, format='JPEG', quality=85)
image_content = img_buffer.getvalue()
# 构建请求
vision_image = vision.Image(content=image_content)
# 发送请求
response = client.text_detection(image=vision_image)
texts = response.text_annotations
if texts:
# 第一个结果是完整的文本
full_text = texts[0].description
text_regions = []
# 处理文字区域
for i, text in enumerate(texts[1:], 1): # 跳过第一个完整文本
vertices = text.bounding_poly.vertices
bounding_box = [
(vertices[0].x, vertices[0].y),
(vertices[1].x, vertices[1].y),
(vertices[2].x, vertices[2].y),
(vertices[3].x, vertices[3].y)
]
text_region = TextRegion(
text=text.description,
confidence=0.9, # Google OCR通常较高
bounding_box=bounding_box
)
text_regions.append(text_region)
avg_confidence = 0.9 # Google OCR默认置信度
return OCRResult(
success=True,
text_regions=text_regions,
avg_confidence=avg_confidence,
detected_language=self._detect_language(full_text),
data=response.to_dict()
)
else:
return OCRResult(
success=False,
text_regions=[],
avg_confidence=0.0,
detected_language=None,
data=response.to_dict(),
error="未识别到文字"
)
def _tesseract_ocr(self, image: Image.Image, params: Dict[str, Any]) -> OCRResult:
"""Tesseract OCR"""
try:
# 配置参数
config = params.get('config', '--oem 3 --psm 6')
lang = params.get('lang', 'eng+chi_sim')
# 执行OCR
data = pytesseract.image_to_data(image, config=config, lang=lang, output_type=pytesseract.Output.DICT)
text_regions = []
total_confidence = 0.0
valid_count = 0
for i in range(len(data['text'])):
text = data['text'][i].strip()
confidence = float(data['conf'][i])
if text and confidence > 0:
# 获取边界框
x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
bounding_box = [
(x, y),
(x + w, y),
(x + w, y + h),
(x, y + h)
]
text_region = TextRegion(
text=text,
confidence=confidence / 100.0, # 转换为0-1
bounding_box=bounding_box
)
text_regions.append(text_region)
total_confidence += confidence / 100.0
valid_count += 1
avg_confidence = total_confidence / valid_count if valid_count > 0 else 0.0
# 合并文本
full_text = ' '.join([region.text for region in text_regions])
return OCRResult(
success=valid_count > 0,
text_regions=text_regions,
avg_confidence=avg_confidence,
detected_language=self._detect_language(full_text),
data=data
)
except Exception as e:
return OCRResult(
success=False,
text_regions=[],
avg_confidence=0.0,
detected_language=None,
data={},
error=str(e)
)
def _translate_text(
self,
text: str,
source_lang: str,
target_lang: str,
request: ImageTranslationRequest
) -> TranslationResult:
"""翻译文本"""
service = request.translation_service
params = request.translation_params
if service == 'baidu':
return self._baidu_translate(text, source_lang, target_lang, params)
elif service == 'google':
return self._google_translate(text, source_lang, target_lang, params)
else:
raise ValueError(f"不支持的翻译服务: {service}")
def _baidu_translate(
self,
text: str,
source_lang: str,
target_lang: str,
params: Dict[str, Any]
) -> TranslationResult:
"""百度翻译"""
client = self.translation_clients.get('baidu')
if not client:
raise ValueError("百度翻译客户端未初始化")
        # Call the Baidu client; its translate() signature uses from_lang/to_lang keyword names
        result = client.translate(
            q=text,
            from_lang=source_lang,
            to_lang=target_lang,
            **params
        )
if 'trans_result' in result:
translated_text = result['trans_result'][0]['dst']
return TranslationResult(
success=True,
translated_text=translated_text,
confidence=0.9,
data=result
)
else:
error_msg = result.get('error_msg', '翻译失败')
return TranslationResult(
success=False,
translated_text="",
confidence=0.0,
data=result,
error=error_msg
)
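    # NOTE: _google_translate is dispatched in _translate_text but was not defined in the original
    # text. Minimal sketch that parses the Translation API v2 response shape
    # {"data": {"translations": [{"translatedText": ...}]}}.
    def _google_translate(
        self,
        text: str,
        source_lang: str,
        target_lang: str,
        params: Dict[str, Any]
    ) -> TranslationResult:
        """Google translation (sketch)"""
        client = self.translation_clients.get('google')
        if not client:
            raise ValueError("Google translation client is not initialized")
        result = client.translate(q=text, source=source_lang, target=target_lang, **params)
        translations = result.get('data', {}).get('translations', [])
        if translations:
            return TranslationResult(
                success=True,
                translated_text=translations[0].get('translatedText', ''),
                confidence=0.9,
                data=result
            )
        error_msg = result.get('error', {}).get('message', 'Translation failed')
        return TranslationResult(
            success=False,
            translated_text="",
            confidence=0.0,
            data=result,
            error=error_msg
        )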
def _generate_translated_image(
self,
original_image: Image.Image,
text_regions: List[TextRegion],
translated_text: str
) -> Optional[str]:
"""生成翻译后的图片(保持布局)"""
try:
# 创建副本
translated_image = original_image.copy()
draw = ImageDraw.Draw(translated_image)
            # Load a font; fall back to the default bitmap font if arial.ttf is unavailable
            try:
                font = ImageFont.truetype("arial.ttf", 16)
            except OSError:
                font = ImageFont.load_default()
# 按行处理文本
lines = translated_text.split('\n')
for i, line in enumerate(lines):
if i < len(text_regions):
region = text_regions[i]
bbox = region.bounding_box
# 计算文本位置(居中)
x1, y1 = bbox[0]
x2, y2 = bbox[2]
center_x = (x1 + x2) // 2
center_y = (y1 + y2) // 2
                    # Draw a white background behind the text (Pillow 10 removed textsize; use textbbox)
                    left, top, right, bottom = draw.textbbox((0, 0), line, font=font)
                    text_width, text_height = right - left, bottom - top
bg_x1 = center_x - text_width // 2 - 2
bg_y1 = center_y - text_height // 2 - 2
bg_x2 = center_x + text_width // 2 + 2
bg_y2 = center_y + text_height // 2 + 2
draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill='white')
# 绘制文本
draw.text((center_x, center_y), line, fill='black', font=font, anchor='mm')
# 转换为Base64
img_buffer = io.BytesIO()
translated_image.save(img_buffer, format='PNG')
img_base64 = base64.b64encode(img_buffer.getvalue()).decode()
return f"data:image/png;base64,{img_base64}"
except Exception as e:
print(f"生成翻译图片失败: {e}")
return None
def _get_cache_key(self, request: ImageTranslationRequest) -> str:
"""生成缓存键"""
import hashlib
# 获取图片指纹
if isinstance(request.image, Image.Image):
img_buffer = io.BytesIO()
request.image.save(img_buffer, format='JPEG', quality=85)
image_hash = hashlib.md5(img_buffer.getvalue()).hexdigest()
elif isinstance(request.image, str) and request.image.startswith('data:image'):
image_data = base64.b64decode(request.image.split(',')[1])
image_hash = hashlib.md5(image_data).hexdigest()
elif isinstance(request.image, bytes):
image_hash = hashlib.md5(request.image).hexdigest()
else:
image_hash = hashlib.md5(str(request.image).encode()).hexdigest()
key_data = f"{image_hash}:{request.source_lang}:{request.target_lang}:{request.ocr_service}:{request.translation_service}"
return f"img_trans:{hashlib.md5(key_data.encode()).hexdigest()}"
Helper classes
@dataclass
class OCRResult:
"""OCR结果"""
success: bool
text_regions: List[TextRegion]
avg_confidence: float
detected_language: Optional[str]
data: Dict[str, Any]
error: Optional[str] = None
@dataclass
class TranslationResult:
"""翻译结果"""
success: bool
translated_text: str
confidence: float
data: Dict[str, Any]
error: Optional[str] = None
Translation clients
class BaiduTranslationClient:
"""百度翻译客户端"""
def __init__(self, config: Dict[str, Any]):
self.config = config
self.session = requests.Session()
def translate(self, q: str, from_lang: str, to_lang: str, **kwargs) -> Dict[str, Any]:
"""百度翻译"""
import random
import hashlib
app_id = self.config.get('app_id')
app_key = self.config.get('app_key')
api_base = self.config.get('api_base')
salt = str(random.randint(32768, 65536))
sign_str = app_id + q + salt + app_key
sign = hashlib.md5(sign_str.encode('utf-8')).hexdigest()
params = {
'q': q,
'from': from_lang,
'to': to_lang,
'appid': app_id,
'salt': salt,
'sign': sign
}
params.update(kwargs)
response = self.session.post(api_base, data=params, timeout=30)
return response.json()
class GoogleTranslationClient:
"""Google翻译客户端"""
def __init__(self, config: Dict[str, Any]):
self.config = config
self.session = requests.Session()
def translate(self, q: str, source: str, target: str, **kwargs) -> Dict[str, Any]:
"""Google翻译"""
api_key = self.config.get('api_key')
api_base = self.config.get('api_base')
params = {
'q': q,
'source': source,
'target': target,
'key': api_key
}
params.update(kwargs)
response = self.session.post(api_base, data=params, timeout=30)
return response.json()
4.2 Async Implementation
import aiohttp
from typing import List, Dict, Any, Union
class AsyncImageTranslationAPI(ImageTranslationAPI):
    """Async image translation API (inherits the sync client's OCR/translation setup and helpers)"""
    def __init__(self, config: Dict[str, Any], redis_client=None):
        self.config = config
        self.redis = redis_client
        self.session = None
        self.ocr_clients = self._init_ocr_clients()
        self.translation_clients = self._init_translation_clients()
async def __aenter__(self):
self.session = aiohttp.ClientSession()
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.session.close()
async def translate_image_async(
self,
image: Union[str, bytes],
source_lang: str,
target_lang: str,
**kwargs
) -> ImageTranslationResult:
"""异步图片翻译"""
start_time = time.time()
# 构建请求
request = ImageTranslationRequest(
image=image,
source_lang=source_lang,
target_lang=target_lang,
**kwargs
)
# 检查缓存
cache_key = self._get_cache_key(request)
if self.redis:
cached = await self.redis.get(cache_key)
if cached:
data = json.loads(cached)
return ImageTranslationResult(**data)
try:
# 预处理图片
processed_image = await self._preprocess_image_async(request.image)
# OCR识别
ocr_result = await self._perform_ocr_async(processed_image, request)
if not ocr_result.success:
return ImageTranslationResult(
success=False,
code=400,
message="OCR识别失败",
data={},
original_text="",
translated_text="",
source_lang=source_lang,
target_lang=target_lang,
detected_language=None,
ocr_confidence=0.0,
translation_confidence=0.0,
text_regions=[],
ocr_service=request.ocr_service,
translation_service=request.translation_service,
processing_time=time.time() - start_time
)
# 合并文本
original_text = self._combine_text_regions(ocr_result.text_regions)
# 翻译
translation_result = await self._translate_text_async(
original_text, source_lang, target_lang, request
)
if not translation_result.success:
return ImageTranslationResult(
success=False,
code=400,
message=f"翻译失败: {translation_result.message}",
data={},
original_text=original_text,
translated_text="",
source_lang=source_lang,
target_lang=target_lang,
detected_language=ocr_result.detected_language,
ocr_confidence=ocr_result.avg_confidence,
translation_confidence=0.0,
text_regions=ocr_result.text_regions,
ocr_service=request.ocr_service,
translation_service=request.translation_service,
processing_time=time.time() - start_time
)
# 生成结果
result = ImageTranslationResult(
success=True,
code=200,
message="成功",
data={
'ocr_result': ocr_result.data,
'translation_result': translation_result.data
},
original_text=original_text,
translated_text=translation_result.translated_text,
source_lang=source_lang,
target_lang=target_lang,
detected_language=ocr_result.detected_language,
ocr_confidence=ocr_result.avg_confidence,
translation_confidence=translation_result.confidence,
text_regions=ocr_result.text_regions,
ocr_service=request.ocr_service,
translation_service=request.translation_service,
processing_time=time.time() - start_time
)
# 缓存
if self.redis:
await self.redis.setex(
cache_key,
self.config.get('CACHE_TTL', 86400),
                    json.dumps(result.__dict__, default=lambda o: o.__dict__)  # serialize nested dataclasses
)
return result
except Exception as e:
processing_time = time.time() - start_time
return ImageTranslationResult(
success=False,
code=500,
message=f"图片翻译失败: {str(e)}",
data={},
original_text="",
translated_text="",
source_lang=source_lang,
target_lang=target_lang,
detected_language=None,
ocr_confidence=0.0,
translation_confidence=0.0,
text_regions=[],
ocr_service=request.ocr_service,
translation_service=request.translation_service,
processing_time=processing_time
)
async def _preprocess_image_async(self, image: Union[str, bytes]) -> Image.Image:
"""异步图片预处理"""
# 从Base64或二进制转换为PIL Image
if isinstance(image, str) and image.startswith('data:image'):
image_data = base64.b64decode(image.split(',')[1])
img = Image.open(io.BytesIO(image_data))
elif isinstance(image, bytes):
img = Image.open(io.BytesIO(image))
else:
raise ValueError("不支持的图片格式")
# 同步预处理(PIL不支持异步)
return self._preprocess_image_sync(img)
def _preprocess_image_sync(self, image: Image.Image) -> Image.Image:
"""同步图片预处理"""
        # Same preprocessing logic as the sync version
img_config = self.config.get('IMAGE_PROCESSING', {})
if img_config.get('auto_enhance', True):
max_size = img_config.get('max_size', (4096, 4096))
image.thumbnail(max_size, Image.Resampling.LANCZOS)
if image.mode != 'RGB':
image = image.convert('RGB')
enhancer = ImageEnhance.Contrast(image)
image = enhancer.enhance(1.2)
enhancer = ImageEnhance.Sharpness(image)
image = enhancer.enhance(1.1)
return image
async def _perform_ocr_async(
self,
image: Image.Image,
request: ImageTranslationRequest
) -> OCRResult:
"""异步OCR识别"""
service = request.ocr_service
if service == 'baidu':
return await self._baidu_ocr_async(image, request.ocr_params)
elif service == 'google':
return await self._google_ocr_async(image, request.ocr_params)
else:
raise ValueError(f"不支持的OCR服务: {service}")
async def _baidu_ocr_async(
self,
image: Image.Image,
params: Dict[str, Any]
) -> OCRResult:
"""异步百度OCR"""
client = self.ocr_clients.get('baidu')
if not client:
raise ValueError("百度OCR客户端未初始化")
        # Encode the image as JPEG bytes
        img_buffer = io.BytesIO()
        image.save(img_buffer, format='JPEG', quality=85)
        image_bytes = img_buffer.getvalue()
        # Build the options dict; the baidu-aip SDK's general() takes (image_bytes, options)
        options = {
            'recognize_granularity': 'big',
            'language_type': 'CHN_ENG',
            'detect_direction': 'true'
        }
        options.update(params)
        # The baidu-aip SDK is blocking, so run it in a thread pool
        try:
            from concurrent.futures import ThreadPoolExecutor
            with ThreadPoolExecutor() as executor:
                response = await asyncio.get_event_loop().run_in_executor(
                    executor, client.general, image_bytes, options
                )
# 解析结果
if 'words_result' in response:
text_regions = []
total_confidence = 0.0
for item in response['words_result']:
text = item.get('words', '')
confidence = item.get('probability', {}).get('average', 0.0) if isinstance(item.get('probability'), dict) else 0.8
location = item.get('location', {})
bounding_box = [
(location.get('left', 0), location.get('top', 0)),
(location.get('left', 0) + location.get('width', 0), location.get('top', 0)),
(location.get('left', 0) + location.get('width', 0), location.get('top', 0) + location.get('height', 0)),
(location.get('left', 0), location.get('top', 0) + location.get('height', 0))
]
text_region = TextRegion(
text=text,
confidence=confidence,
bounding_box=bounding_box
)
text_regions.append(text_region)
total_confidence += confidence
avg_confidence = total_confidence / len(text_regions) if text_regions else 0.0
return OCRResult(
success=True,
text_regions=text_regions,
avg_confidence=avg_confidence,
detected_language='zh',
data=response
)
else:
error_msg = response.get('error_msg', 'OCR识别失败')
return OCRResult(
success=False,
text_regions=[],
avg_confidence=0.0,
detected_language=None,
data=response,
error=error_msg
)
except Exception as e:
return OCRResult(
success=False,
text_regions=[],
avg_confidence=0.0,
detected_language=None,
data={},
error=str(e)
)
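    # NOTE: _translate_text_async and _google_ocr_async are referenced above but were not defined
    # in the original text. Minimal sketches that reuse the inherited sync implementations by
    # running the blocking calls in the default thread-pool executor.
    async def _translate_text_async(
        self,
        text: str,
        source_lang: str,
        target_lang: str,
        request: ImageTranslationRequest
    ) -> TranslationResult:
        """Run the blocking sync translation in a thread pool."""
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(
            None, self._translate_text, text, source_lang, target_lang, request
        )

    async def _google_ocr_async(
        self,
        image: Image.Image,
        params: Dict[str, Any]
    ) -> OCRResult:
        """Run the blocking Google Vision OCR call in a thread pool."""
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, self._google_ocr, image, params)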
async def batch_translate_images_async(
self,
images: List[Union[str, bytes]],
source_lang: str,
target_lang: str,
**kwargs
) -> BatchImageTranslationResult:
"""异步批量图片翻译"""
start_time = time.time()
tasks = []
for image in images:
task = self.translate_image_async(image, source_lang, target_lang, **kwargs)
tasks.append(task)
results = await asyncio.gather(*tasks, return_exceptions=True)
# 处理结果
success_count = 0
failed_count = 0
processed_results = []
for result in results:
if isinstance(result, Exception):
failed_count += 1
error_result = ImageTranslationResult(
success=False,
code=500,
message=str(result),
data={},
original_text="",
translated_text="",
source_lang=source_lang,
target_lang=target_lang,
detected_language=None,
ocr_confidence=0.0,
translation_confidence=0.0,
text_regions=[],
ocr_service=kwargs.get('ocr_service', 'baidu'),
translation_service=kwargs.get('translation_service', 'baidu'),
processing_time=0.0
)
processed_results.append(error_result)
else:
if result.success:
success_count += 1
else:
failed_count += 1
processed_results.append(result)
total_time = time.time() - start_time
return BatchImageTranslationResult(
success=success_count > 0,
code=200 if success_count > 0 else 500,
message=f"批量翻译完成,成功{success_count}条,失败{failed_count}条",
data={},
results=processed_results,
total_count=len(images),
success_count=success_count,
failed_count=failed_count,
total_time=total_time
)
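A short usage sketch for the async client above. The file names are placeholders, and the images are read into bytes first because the async preprocessor accepts bytes or Base64 strings rather than file paths:

import asyncio

async def main():
    async with AsyncImageTranslationAPI(vars(ImageTranslationConfig)) as api:
        images = []
        for path in ("img1.jpg", "img2.png"):  # placeholder file paths
            with open(path, "rb") as f:
                images.append(f.read())
        batch = await api.batch_translate_images_async(images, "zh", "en")
        print(batch.message)
        for item in batch.results:
            print(item.translated_text)

asyncio.run(main())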
5. Usage Examples
5.1 Basic Usage
Initialize the API client
api = ImageTranslationAPI(vars(ImageTranslationConfig))  # pass a dict-like view; the client calls config.get(...)
1. Translate from a file
result = api.translate_image(
image="path/to/image.jpg",
source_lang="zh",
target_lang="en",
ocr_service="baidu",
translation_service="baidu"
)
if result.success:
    print(f"Recognized text: {result.original_text}")
    print(f"Translation: {result.translated_text}")
    print(f"OCR confidence: {result.ocr_confidence:.2f}")
    print(f"Processing time: {result.processing_time:.2f}s")
    # Save the translated image
    if result.translated_image:
        with open("translated_image.png", "wb") as f:
            image_data = base64.b64decode(result.translated_image.split(',')[1])
            f.write(image_data)
2. Translate from Base64
with open("image.jpg", "rb") as f:
image_data = f.read()
image_base64 = base64.b64encode(image_data).decode()
result = api.translate_image(
image=f"data:image/jpeg;base64,{image_base64}",
source_lang="zh",
target_lang="en"
)
3. Batch translation
images = ["image1.jpg", "image2.png", "image3.jpeg"]
results = api.batch_translate_images(images, "zh", "en")
for i, result in enumerate(results):
if result.success:
print(f"图片{i+1}: {result.translated_text}")
5.2 Advanced Features
Layout-preserving translation
result = api.translate_image(
image="document.png",
source_lang="zh",
target_lang="en",
format="image", # 返回翻译后图片
preserve_layout=True,
enhance_image=True
)
Custom OCR parameters
result = api.translate_image(
image="menu.jpg",
source_lang="ja",
target_lang="zh",
ocr_service="google",
ocr_params={
"language_hints": ["ja"],
"enable_text_detection_confidence_score": True
}
)
Automatic language detection
result = api.translate_image(
image="unknown_text.jpg",
source_lang="auto",
target_lang="zh",
detect_language=True
)
6. Real-World Application Scenarios
6.1 Document Translation System
class DocumentTranslationSystem:
"""文档翻译系统"""
def __init__(self, image_api, file_api):
self.image_api = image_api
self.file_api = file_api
def translate_document_images(
self,
document_path: str,
source_lang: str,
target_lang: str
) -> List[ImageTranslationResult]:
"""翻译文档中的图片"""
# 提取文档中的图片
images = self._extract_images_from_document(document_path)
# 翻译图片
results = []
for image_path in images:
result = self.image_api.translate_image(
image=image_path,
source_lang=source_lang,
target_lang=target_lang,
format="image", # 返回翻译后图片
preserve_layout=True
)
results.append(result)
return results
def _extract_images_from_document(self, document_path: str) -> List[str]:
"""从文档中提取图片"""
import fitz # PyMuPDF
images = []
if document_path.lower().endswith('.pdf'):
doc = fitz.open(document_path)
for page_num in range(len(doc)):
page = doc.load_page(page_num)
image_list = page.get_images()
for img in image_list:
# 提取图片
xref = img[0]
pix = fitz.Pixmap(doc, xref)
if pix.n - pix.alpha < 4: # 非透明图片
image_path = f"temp_image_{page_num}_{len(images)}.png"
pix.save(image_path)
images.append(image_path)
pix = None
doc.close()
return images
6.2 Mobile Image Translation
class MobileImageTranslator:
"""移动端图片翻译器"""
def __init__(self, api_client):
self.api = api_client
def translate_camera_image(
self,
image_data: bytes,
source_lang: str,
target_lang: str
) -> ImageTranslationResult:
"""翻译摄像头拍摄的图片"""
# 预处理(适合移动端)
image = Image.open(io.BytesIO(image_data))
# 调整大小
image.thumbnail((1024, 1024), Image.Resampling.LANCZOS)
# 增强
enhancer = ImageEnhance.Contrast(image)
image = enhancer.enhance(1.3)
# 转换为Base64
img_buffer = io.BytesIO()
image.save(img_buffer, format='JPEG', quality=80)
image_base64 = base64.b64encode(img_buffer.getvalue()).decode()
# 翻译
result = self.api.translate_image(
image=f"data:image/jpeg;base64,{image_base64}",
source_lang=source_lang,
target_lang=target_lang,
ocr_service="baidu", # 百度OCR对中文支持好
translation_service="baidu"
)
return result
7. Troubleshooting and Optimization
7.1 Common Issues
Issue 1: Poor image quality causes recognition failures
def enhance_image_for_ocr(image: Image.Image) -> Image.Image:
"""增强图片质量以提高OCR准确率"""
# 转换为灰度图
if image.mode != 'L':
image = image.convert('L')
# 增强对比度
enhancer = ImageEnhance.Contrast(image)
image = enhancer.enhance(2.0)
# 增强亮度
enhancer = ImageEnhance.Brightness(image)
image = enhancer.enhance(1.2)
# 二值化
image = image.point(lambda x: 0 if x < 128 else 255, '1')
return image
Issue 2: Detecting text orientation
def detect_text_orientation(image: Image.Image) -> int:
"""检测文字方向"""
try:
# 使用Tesseract检测方向
osd = pytesseract.image_to_osd(image)
# 解析结果
rotation = 0
for line in osd.split('\n'):
if 'Rotate' in line:
rotation = int(line.split(':')[1].strip())
break
return rotation
    except Exception:
        return 0
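Once the rotation angle is known, the image can be straightened before OCR. A small sketch building on the function above (check the sign convention for your Tesseract version; PIL rotates counter-clockwise while Tesseract reports the correction angle):

def correct_orientation(image: Image.Image) -> Image.Image:
    """Rotate the image so the text is upright before running OCR (sketch)."""
    rotation = detect_text_orientation(image)
    if rotation:
        # Negative angle rotates clockwise in PIL; verify against your Tesseract OSD output
        image = image.rotate(-rotation, expand=True, fillcolor="white")
    return image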
7.2 Performance Optimization Tips
Image preprocessing optimization
def optimize_image_for_ocr(image: Image.Image, target_size: Tuple[int, int] = (2048, 2048)) -> Image.Image:
"""优化图片用于OCR"""
# 调整大小
image.thumbnail(target_size, Image.Resampling.LANCZOS)
# 转换为RGB
if image.mode != 'RGB':
image = image.convert('RGB')
# 自动白平衡
image_array = np.array(image)
result = cv2.cvtColor(image_array, cv2.COLOR_RGB2LAB)
avg_a = np.average(result[:, :, 1])
avg_b = np.average(result[:, :, 2])
result[:, :, 1] = result[:, :, 1] - ((avg_a - 128) * (result[:, :, 0] / 255.0) * 1.1)
result[:, :, 2] = result[:, :, 2] - ((avg_b - 128) * (result[:, :, 0] / 255.0) * 1.1)
result = cv2.cvtColor(result, cv2.COLOR_LAB2RGB)
return Image.fromarray(result)
Smart service selection
def select_optimal_ocr_service(image: Image.Image, detected_language: str) -> str:
"""根据图片特征选择最佳OCR服务"""
# 分析图片特征
image_array = np.array(image)
# 计算文字密度
text_density = calculate_text_density(image_array)
# 根据语言选择
if detected_language in ['zh', 'ja', 'ko']:
if text_density > 0.3: # 文字密集
return 'baidu' # 百度对中文支持好
else:
return 'google' # Google对复杂布局支持好
else:
return 'google' # Google对英文支持好
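calculate_text_density is not defined in the original text. A minimal sketch that estimates the share of "ink" pixels after Otsu binarization could look like this (assumes an RGB numpy array, as produced above):

def calculate_text_density(image_array: np.ndarray) -> float:
    """Rough text-density estimate: fraction of dark pixels after Otsu binarization."""
    gray = cv2.cvtColor(image_array, cv2.COLOR_RGB2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    return float(np.count_nonzero(binary)) / binary.size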
8. Best Practices Summary
8.1 Core Strengths
Integrated pipeline: OCR and translation are seamlessly combined
Broad format support: all mainstream image formats
Smart optimization: automatic image enhancement and text detection
Layout preservation: translated images keep the original layout
Batch processing: large-scale batch translation of images
8.2 Usage Recommendations
Image quality: make sure images are sharp and the text is legible
Service selection: use Baidu OCR for Chinese images and Google Vision for complex layouts
Parameter tuning: adjust OCR parameters to the image type
Caching: cache results for identical images to avoid repeated processing
8.3 Deployment Recommendations
Async processing: use the async API to improve concurrency
Image compression: compress images during preprocessing to cut transfer time
Retry on errors: retry on network errors and API rate limiting (a minimal retry sketch follows this list)
Monitoring and alerting: monitor processing time and success rate
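A minimal retry wrapper for the "retry on errors" recommendation above. The retryable-exception list and backoff values are assumptions; adjust them to the errors your OCR/translation services actually raise:

import time
import requests

def translate_with_retry(api, image, source_lang, target_lang, max_retries=3, backoff=1.0, **kwargs):
    """Retry image translation with exponential backoff on transient failures (sketch)."""
    result = None
    for attempt in range(max_retries):
        try:
            result = api.translate_image(image, source_lang, target_lang, **kwargs)
            if result.success:
                return result
        except requests.RequestException:
            pass  # network error: fall through to the backoff and retry
        time.sleep(backoff * (2 ** attempt))  # exponential backoff between attempts
    return result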
Appendix: Quick-Start Template
quick_start.py
from image_translation_api import ImageTranslationAPI, ImageTranslationConfig
import base64
import redis
1. Initialize the client
redis_client = redis.Redis.from_url(ImageTranslationConfig.REDIS_URL)
api = ImageTranslationAPI(vars(ImageTranslationConfig), redis_client)  # pass a dict-like config view
2. Simple translation
result = api.translate_image(
image="example.jpg",
source_lang="zh",
target_lang="en"
)
if result.success:
    print(f"Recognized text: {result.original_text}")
    print(f"Translation: {result.translated_text}")
    # Save the translated image
    if result.translated_image:
        with open("translated.png", "wb") as f:
            image_data = base64.b64decode(result.translated_image.split(',')[1])
            f.write(image_data)
3. Batch translation
images = ["img1.jpg", "img2.png", "img3.jpeg"]
results = api.batch_translate_images(images, "zh", "en")
for i, result in enumerate(results):
if result.success:
print(f"图片{i+1}翻译成功: {result.translated_text}")
With this guide you should be able to:
Understand the image-translation endpoint's full feature set and parameter options
Implement an integrated OCR + translation pipeline
Handle a wide range of image formats and text-recognition scenarios
Build a high-performance image-translation application
Apply image translation flexibly in real business scenarios
Choose the OCR and translation service combination that fits your actual needs, and follow the best practices above to keep the system stable and maintainable.