这里测试的大模型主打一个免费,阿里通义灵码免费,ollama免费。
截屏测试
import pyautogui
import base64
from ollama import Client
from pynput import mouse
初始化大模型客户端
client = Client(
host='http://192.168.0.37:11434',
headers={'x-some-header': 'some-value'}
)
def encode_image(image_path: str) -> str:
with open(image_path, "rb") as image_file:
encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
return f"data:image/jpeg;base64,{encoded_string}"
def capture_screen_region():
print("请按住鼠标左键并拖动来选择矩形区域...")
start_x, start_y = None, None
end_x, end_y = None, None
def on_click(x, y, button, pressed):
nonlocal start_x, start_y, end_x, end_y
if button == mouse.Button.left:
if pressed:
start_x, start_y = x, y
print(f"鼠标按下位置: ({start_x}, {start_y})")
else:
end_x, end_y = x, y
print(f"鼠标释放位置: ({end_x}, {end_y})")
return False # 停止监听
# 创建鼠标监听器
with mouse.Listener(on_click=on_click) as listener:
listener.join()
# 计算矩形区域的左上角和右下角坐标
left = min(start_x, end_x)
top = min(start_y, end_y)
width = abs(end_x - start_x)
height = abs(end_y - start_y)
screenshot = pyautogui.screenshot(region=(left, top, width, height))
screenshot_path = "screenshot.png"
screenshot.save(screenshot_path)
return screenshot_path
def perform_ocr(image_path: str):
# system_prompt = "Convert the content of the image into text."
system_prompt = """Convert the provided image into Markdown format.
Requirements:
- Output Only Markdown: Return solely the Markdown content without any additional explanations or comments.
- No Delimiters: Do not use code fences or delimiters like \`\`\`markdown.
- Complete Content: Do not omit any part of the page, including headers, footers, and subtext.
"""
response = client.chat(
model='llama3.2-vision:11b',
messages=[
{
'role': 'user',
'content': system_prompt,
'images': [image_path],
}
],
)
return response.message.content
if name == "main":
image_path = capture_screen_region()
markdown_content = perform_ocr(image_path)
print(markdown_content)