An LLM testing tool: swapping LaVague's backend for a domestic Chinese LLM
LaVague is an LLM agent that turns natural-language instructions into browser interactions in order to carry out automated testing.
About LaVague
LaVague uses an LLM as an engine that converts natural language into Selenium code, letting users (or other AI systems) automate the browser with little effort.
LaVague implements this natural-language-to-Python-Selenium capability on top of Llama Index. Its examples ship with two modes: calling an LLM hosted on Hugging Face, or running an LLM locally. The hosted mode generates the Selenium code with the Nous-Hermes-2-Mixtral-8x7B-DPO model plus the BAAI/bge-small-en-v1.5 embedding model, but using the Hugging Face API this way requires a paid Pro membership, and access is inconvenient besides.
The local mode means downloading the model and fitting it into local GPU memory, which calls for a top-spec GPU in your development laptop; that was not something I could solve.
So what do you do if you still want to try it?
Zhipu's free models as a drop-in for the paid Hugging Face ones
Zhipu provides an embedding model as well, and a free account comes with 1,000,000 tokens, with another 4,000,000 added after real-name verification. I did the math and decided that was enough, so I set out to replace LaVague's LLM layer with the Zhipu models. (Registration steps omitted.)
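Before touching LaVague at all, it is worth a quick smoke test that the key and SDK work. Below is a minimal sketch using the zhipuai SDK (pip install zhipuai), assuming the key is exported as a ZHIPU_API_KEY environment variable; the prompt is arbitrary:

import os
from zhipuai import ZhipuAI

# Assumption: the key lives in the ZHIPU_API_KEY environment variable.
client = ZhipuAI(api_key=os.environ["ZHIPU_API_KEY"])
response = client.chat.completions.create(
    model="glm-4",
    messages=[{"role": "user", "content": "Introduce yourself in one sentence."}],
)
print(response.choices[0].message.content)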
LaVague is developed on top of Llama Index, which provides a CustomLLM base class precisely so you can extend it yourself, so we build on it a ChatGLM class that calls the Zhipu models:
# Imports for chatglm.py: the zhipuai SDK and llama-index 0.10.x.
# to_message_dicts / get_additional_kwargs were referenced but not defined in
# the original snippet, so minimal versions are sketched here.
from typing import Any, List, Optional, Sequence

from zhipuai import ZhipuAI

from llama_index.core.base.llms.types import (
    ChatMessage,
    ChatResponse,
    ChatResponseGen,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
    MessageRole,
)
from llama_index.core.bridge.pydantic import Field, PrivateAttr
from llama_index.core.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
from llama_index.core.llms import CustomLLM
from llama_index.core.llms.callbacks import llm_chat_callback, llm_completion_callback

DEFAULT_MODEL = "glm-4"


def to_message_dicts(messages: Sequence[ChatMessage]) -> List[dict]:
    """Convert llama-index ChatMessages into the dicts the ZhipuAI SDK expects."""
    return [{"role": m.role.value, "content": m.content} for m in messages]


def get_additional_kwargs(response: Any) -> dict:
    """Surface token usage from the raw ZhipuAI response."""
    return {"usage": getattr(response, "usage", None)}


class ChatGLM(CustomLLM):
    num_output: int = DEFAULT_NUM_OUTPUTS
    context_window: int = Field(
        default=DEFAULT_CONTEXT_WINDOW,
        description="The maximum number of context tokens for the model.",
        gt=0,
    )
    model: str = Field(default=DEFAULT_MODEL, description="The ChatGLM model to use: glm-4 or glm-3-turbo.")
    api_key: Optional[str] = Field(default=None, description="The ChatGLM API key.")
    reuse_client: bool = Field(
        default=True,
        description=(
            "Reuse the client between requests. When doing anything with large "
            "volumes of async API calls, setting this to false can improve stability."
        ),
    )
    _client: Optional[Any] = PrivateAttr()

    def __init__(
        self,
        model: str = DEFAULT_MODEL,
        reuse_client: bool = True,
        api_key: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(
            model=model,
            api_key=api_key,
            reuse_client=reuse_client,
            **kwargs,
        )
        self._client = None

    def _get_client(self) -> ZhipuAI:
        if not self.reuse_client:
            return ZhipuAI(api_key=self.api_key)
        if self._client is None:
            self._client = ZhipuAI(api_key=self.api_key)
        return self._client

    @classmethod
    def class_name(cls) -> str:
        return "chatglm_llm"

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model,
        )

    def _chat(self, messages: List, stream: bool = False) -> Any:
        # Forward the stream flag so the streaming methods actually stream.
        return self._get_client().chat.completions.create(
            model=self.model,
            messages=messages,
            stream=stream,
        )

    @llm_chat_callback()
    def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
        message_dicts: List = to_message_dicts(messages)
        response = self._chat(message_dicts, stream=False)
        return ChatResponse(
            message=ChatMessage(
                content=response.choices[0].message.content,
                role=MessageRole(response.choices[0].message.role),
                additional_kwargs={},
            ),
            raw=response,
            additional_kwargs=get_additional_kwargs(response),
        )

    @llm_chat_callback()
    def stream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen:
        message_dicts: List = to_message_dicts(messages)
        response = self._chat(message_dicts, stream=True)
        response_txt = ""
        for chunk in response:
            token = chunk.choices[0].delta.content or ""
            response_txt += token
            yield ChatResponse(
                message=ChatMessage(content=response_txt, role=MessageRole.ASSISTANT, additional_kwargs={}),
                delta=token,
                raw=chunk,
            )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        messages = [{"role": "user", "content": prompt}]
        response = self._chat(messages, stream=False)
        return CompletionResponse(
            text=str(response.choices[0].message.content),
            raw=response,
            additional_kwargs=get_additional_kwargs(response),
        )

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        messages = [{"role": "user", "content": prompt}]
        response = self._chat(messages, stream=True)
        response_txt = ""
        for chunk in response:
            token = chunk.choices[0].delta.content or ""
            response_txt += token
            yield CompletionResponse(text=response_txt, delta=token)
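Before wiring the wrapper into LaVague, a quick sketch to verify both the blocking and streaming paths; this assumes a valid key in the ZHIPU_API_KEY environment variable, and the prompts are arbitrary:

import os

llm = ChatGLM(model="glm-4", api_key=os.environ["ZHIPU_API_KEY"])
print(llm.complete("Write a one-line Python comment about testing.").text)
for part in llm.stream_complete("Count from 1 to 5."):
    print(part.delta, end="", flush=True)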
With the LLM wrapper done, look at the ActionEngine class: it needs both an LLM and an embedding model, and we should not use the LLM in place of the embedding model. An embedding model produces vector representations of text, and the main thing we want from it is that those vectors capture the text's semantics; some models, such as BERT, are trained specifically to strengthen semantic understanding. An LLM's main task, by contrast, is next-token prediction: repeatedly emitting a plausible next word. LLMs are typically pretrained on token prediction over large corpora and then fine-tuned for concrete NLP tasks such as translation, chatbots, and Q&A, where the baseline requirement is fluent output. An LLM therefore concentrates on generating coherent text, and the semantic constraint on its intermediate-layer embeddings weakens; this is even more pronounced in decoder-only LLMs. By analogy: studying physics (the LLM) requires some mathematics (textual semantics), but studying only physics rarely earns you a good math grade. From a training standpoint, the LLM's training objective simply is not aimed at capturing textual semantics.

So we inherit from Llama Index's BaseEmbedding instead and implement the embedding-model call; the code follows:
# BaseEmbedding comes from llama-index core; the other imports are shared with
# the ChatGLM class above.
from llama_index.core.base.embeddings.base import BaseEmbedding


class ChatGLMEmbeddings(BaseEmbedding):
    model: str = Field(default="embedding-2", description="The ChatGLM embedding model to use: embedding-2.")
    api_key: Optional[str] = Field(default=None, description="The ChatGLM API key.")
    reuse_client: bool = Field(
        default=True,
        description=(
            "Reuse the client between requests. When doing anything with large "
            "volumes of async API calls, setting this to false can improve stability."
        ),
    )
    _client: Optional[Any] = PrivateAttr()

    def __init__(
        self,
        model: str = "embedding-2",
        reuse_client: bool = True,
        api_key: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(
            model=model,
            api_key=api_key,
            reuse_client=reuse_client,
            **kwargs,
        )
        self._client = None

    def _get_client(self) -> ZhipuAI:
        if not self.reuse_client:
            return ZhipuAI(api_key=self.api_key)
        if self._client is None:
            self._client = ZhipuAI(api_key=self.api_key)
        return self._client

    @classmethod
    def class_name(cls) -> str:
        return "ChatGLMEmbedding"

    def _get_query_embedding(self, query: str) -> List[float]:
        """Get query embedding."""
        return self.get_general_text_embedding(query)

    async def _aget_query_embedding(self, query: str) -> List[float]:
        """The asynchronous version of _get_query_embedding."""
        return self.get_general_text_embedding(query)

    def _get_text_embedding(self, text: str) -> List[float]:
        """Get text embedding."""
        return self.get_general_text_embedding(text)

    async def _aget_text_embedding(self, text: str) -> List[float]:
        """Asynchronously get text embedding."""
        return self.get_general_text_embedding(text)

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Get text embeddings."""
        return [self.get_general_text_embedding(text) for text in texts]

    async def _aget_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Asynchronously get text embeddings."""
        return self._get_text_embeddings(texts)

    def get_general_text_embedding(self, prompt: str) -> List[float]:
        """Call the ZhipuAI embeddings endpoint and return the vector."""
        response = self._get_client().embeddings.create(
            model=self.model,  # the embedding model to call
            input=prompt,
        )
        return response.data[0].embedding
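As a quick check of the property the paragraph above asks for, namely that semantically close instructions map to nearby vectors, here is a small sketch; the two sentences are made up for illustration, and cosine similarity is computed by hand to stay dependency-free:

import math
import os

embedder = ChatGLMEmbeddings(model="embedding-2", api_key=os.environ["ZHIPU_API_KEY"])
a = embedder.get_general_text_embedding("click the login button")
b = embedder.get_general_text_embedding("press the sign-in button")

# Cosine similarity: values near 1.0 mean the model treats the two as synonyms.
dot = sum(x * y for x, y in zip(a, b))
norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
print(f"cosine similarity: {dot / norm:.3f}")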
LaVague on ChatGLM
With the LLM and the embedding model wrapped, the next step is to mirror huggingface_lavague.py and write a ChatGLM counterpart, chatglm_lavague.py:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@File : chatglm_lavague.py
@Time : 2024/03/18 16:33:29
@Author : CrissChan
@Version : 1.0
@Site : https://blog.csdn.net/crisschan
@Desc : None
'''
import locale
from chatglm import ChatGLM, ChatGLMEmbeddings
from llama_index.core import Document
from llama_index.core.node_parser import CodeSplitter
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import get_response_synthesizer
from llama_index.core import PromptTemplate
import gradio as gr
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
locale.getpreferredencoding = lambda: "UTF-8"

ZHIPU_API_KEY = "<your_key>"  # replace with your own Zhipu API key

model_id = "glm-4"

## ChatGLM chat model and embedding model
llm = ChatGLM(model=model_id, reuse_client=True, api_key=ZHIPU_API_KEY)
embed_model = "embedding-2"
embedder = ChatGLMEmbeddings(model=embed_model, reuse_client=True, api_key=ZHIPU_API_KEY)
with open("prompt_template.txt", "r") as file:
PROMPT_TEMPLATE_STR = file.read()
# Preparing the action engine
MAX_CHARS = 1500
K = 3
class ActionEngine:
    def __init__(self, llm, embedding):
        self.llm = llm
        self.embedding = embedding

    def _get_index(self, html):
        text_list = [html]
        documents = [Document(text=t) for t in text_list]
        splitter = CodeSplitter(
            language="html",
            chunk_lines=40,  # lines per chunk
            chunk_lines_overlap=200,  # lines overlap between chunks
            max_chars=MAX_CHARS,  # max chars per chunk
        )
        nodes = splitter.get_nodes_from_documents(documents)
        nodes = [node for node in nodes if node.text]
        index = VectorStoreIndex(nodes, embed_model=self.embedding)
        return index

    def get_query_engine(self, state):
        html = state
        index = self._get_index(html)
        retriever = BM25Retriever.from_defaults(
            index=index,
            similarity_top_k=K,
        )
        response_synthesizer = get_response_synthesizer(streaming=True, llm=self.llm)
        # assemble query engine
        query_engine = RetrieverQueryEngine(
            retriever=retriever,
            response_synthesizer=response_synthesizer,
        )
        prompt_template = PromptTemplate(PROMPT_TEMPLATE_STR)
        query_engine.update_prompts(
            {"response_synthesizer:text_qa_template": prompt_template}
        )
        return query_engine
# Code execution in action
action_engine = ActionEngine(llm, embedder)
## Setup chrome options
chrome_options = Options()
chrome_options.add_argument("--headless") # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1600,900")
title = """
<div align="center">
<h1> Welcome to ChatGLM-LaVague crisschan</h1>
<p>Redefining internet surfing by transforming natural language instructions into seamless browser interactions.</p>
</div>
"""
# Choose Chrome Browser
webdriver_service = Service()  # let Selenium Manager resolve chromedriver; pass a path here if needed
driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
def process_url(url):
    driver.get(url)
    driver.save_screenshot("screenshot.png")
    # Return a screenshot of the freshly loaded page for the preview pane.
    return "screenshot.png"
def process_instruction(query):
    state = driver.page_source
    query_engine = action_engine.get_query_engine(state)
    streaming_response = query_engine.query(query)
    source_nodes = streaming_response.get_formatted_sources(MAX_CHARS)
    response = ""
    for text in streaming_response.response_gen:
        # accumulate tokens as they arrive
        response += text
        yield response, source_nodes
def exec_code(code):
    # The model wraps its answer in a ``` fence; take the fenced body and
    # comment out its first line (the language tag that follows the fence).
    code = "#" + code.split("```")[1]
    try:
        exec(code)
        print(code)
        return "Successful code execution", code
    except Exception as e:
        output = f"Error in code execution: {str(e)}"
        return output, code
def update_image_display(img):
    driver.save_screenshot("screenshot.png")
    url = driver.current_url
    return "screenshot.png", url
def create_demo(base_url, instructions):
    with gr.Blocks() as demo:
        with gr.Row():
            gr.HTML(title)
        with gr.Row():
            url_input = gr.Textbox(value=base_url, label="Enter URL and press 'Enter' to load the page.")
        with gr.Row():
            with gr.Column(scale=8):
                image_display = gr.Image(label="Browser", interactive=False)
            with gr.Column(scale=2):
                text_area = gr.Textbox(label="Instructions")
                gr.Examples(examples=instructions, inputs=text_area)
                generate_btn = gr.Button(value="Execute")
                code_display = gr.Code(label="Generated code", language="python",
                                       lines=5, interactive=False)
                with gr.Accordion(label="Logs", open=False) as log_accordion:
                    log_display = gr.Textbox(interactive=False)
                    source_display = gr.Textbox(label="Retrieved nodes", interactive=False)
        # Linking components
        url_input.submit(process_url, inputs=url_input, outputs=image_display)
        generate_btn.click(process_instruction, inputs=text_area, outputs=[code_display, source_display]).then(
            exec_code, inputs=code_display, outputs=[log_display, code_display]
        ).then(
            update_image_display, inputs=image_display, outputs=[image_display, url_input]
        )
    demo.launch(share=False)
if __name__ == "__main__":
base_url = "https://bing.com/"
instructions = ["click 'Search the web, input 'crisschan' and press 'Enter'",
"click on the Copilot on the menu ",
"click ’有问题尽量问我‘, and input’hello’,then press ‘Enter’",
"Scroll by 500 pixels",]
create_demo(base_url, instructions)
Replace <your_key> with your own key. The dependencies match the imports above, roughly: llama-index, llama-index-retrievers-bm25, zhipuai, gradio, and selenium. With that done, run the script and the Gradio page comes up.
Type a URL, press Enter, and the Bing home page loads into the preview.
Then pick one of the prewritten instructions and execute it. After a wait (the free tier is slow, so budget a few minutes of patience), the generated Python Selenium code shows up in the Generated code box, the action is executed against the page, and the resulting screenshot appears in the preview on the left.
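For a sense of what lands in the Generated code box, the first instruction produces something roughly like the sketch below; the exact output varies from run to run, and the sb_form_q locator is only an assumption about Bing's markup at the time:

# Illustrative only: the real code is generated fresh from the retrieved HTML.
search_box = driver.find_element(By.ID, "sb_form_q")  # assumed Bing search box id
search_box.click()
search_box.send_keys("crisschan")
search_box.send_keys(Keys.ENTER)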
PS: the free tokens do have an expiry date, so there is no point rationing them.
Summary
AIGC is currently the most widespread way LLMs are applied, and LaVague is a case of text generating code, with the generated code then executed to perform the action. There are two main pieces here. The first is generation: every test action is described as a predicate-object phrase, such as click the login button, type crisschan, or press Enter. They are all "do something" statements, which makes it much easier to map actions to code; analyzing these "do something" instructions produces the prompt needed for test-code generation, and the LLM generates the test code from it. The second is execution: the translated Python test script is run, carrying the action out in the browser. Together that is a pipeline from natural language to action, in other words text2action AIGC.
With that, LaVague runs on a domestic Chinese LLM. If this caught your interest, go try it yourself.