Spaces:
Runtime error
Runtime error
| from openai import OpenAI | |
| from computer_use_demo.gui_agent.llm_utils.oai import encode_image | |
| _NAV_SYSTEM_GROUNDING = """ | |
| You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. | |
| ## Output Format | |
| ```Action: ...``` | |
| ## Action Space | |
| click(start_box='<|box_start|>(x1,y1)<|box_end|>') | |
| hotkey(key='') | |
| type(content='') #If you want to submit your input, use \"\" at the end of `content`. | |
| scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') | |
| wait() #Sleep for 5s and take a screenshot to check for any changes. | |
| finished() | |
| call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help. | |
| ## Note | |
| - Do not generate any other text. | |
| """ | |
def get_prompt_grounding(task):
    """Build the text part of the grounding request from ``task``.

    The template consists of nothing but the task itself, so the result is
    ``task`` rendered as a string.

    Args:
        task: The task description to send to the model.

    Returns:
        The formatted prompt text.
    """
    prompt = "{}".format(task)
    return prompt
| task = """ | |
| ```json | |
| {{ "Observation": "I am on the google homepage of the Chrome browser.", | |
| "Thinking": "The user wants to buy a lap-top on Amazon.com, so I need to click on the address (search) bar of Chrome for entering the 'Amazon.com'.", | |
| "Next Action": ["I need to click DSML"], | |
| "Expectation": "The search button is activated after being clicked, ready to input." | |
| }}``` | |
| """ | |
| task = """ | |
| ```json | |
| {{ | |
| "Observation": "I am on the google homepage of the Chrome browser.", | |
| "Thinking": "The user wants to click DSML", | |
| "Next Action": ["I need to click DSML"], | |
| }}``` | |
| """ | |
| task = """ | |
| ```json | |
| {{ | |
| "Observation": "I am on the google homepage of the Chrome browser.", | |
| "Thinking": "The user wants to click Youtube", | |
| "Next Action": ["I need to click Youtube"], | |
| }}``` | |
| """ | |
def main():
    """Send one grounding request to a UI-TARS endpoint and print the reply.

    Reads the screenshot at ``./chrome.png``, pairs it with the module-level
    ``task`` prompt, posts both to the OpenAI-compatible chat-completions
    endpoint, and prints the content of the first returned choice.
    """
    # Placeholder endpoint and empty key: replace with a real deployment.
    ui_tars_url = "https://your_api_to_uitars.com/v1"
    ui_tars_client = OpenAI(base_url=ui_tars_url, api_key="")

    # ``.format()`` with no arguments leaves the template unchanged as long
    # as it contains no ``{}`` placeholders.
    grounding_system_prompt = _NAV_SYSTEM_GROUNDING.format()
    # Presumably returns the image file as a base64 string; it is embedded
    # in a data URL below. TODO confirm against encode_image.
    screenshot_base64 = encode_image("./chrome.png")
    prompted_message = get_prompt_grounding(task)

    # Adjacent f-strings are used instead of a backslash continuation inside
    # the literal, which would leak source indentation into the output.
    print(
        f"grounding_system_prompt, {grounding_system_prompt}, "
        f"prompted_message: {prompted_message}"
    )

    response = ui_tars_client.chat.completions.create(
        model="ui-tars",
        messages=[
            # The instruction prompt is sent as a user turn, not a system
            # turn, exactly as before.
            {"role": "user", "content": grounding_system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompted_message},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{screenshot_base64}"
                        },
                    },
                ],
            },
        ],
        max_tokens=128,
        temperature=0,
    )

    # Index the response once and reuse the value.
    ui_tars_action = response.choices[0].message.content
    print(ui_tars_action)


if __name__ == "__main__":
    main()