| | import argparse |
| | from io import BytesIO |
| | from time import sleep |
| |
|
| | import helium |
| | import PIL.Image |
| | from dotenv import load_dotenv |
| | from selenium import webdriver |
| | from selenium.webdriver.common.by import By |
| | from selenium.webdriver.common.keys import Keys |
| |
|
| | from smolagents import CodeAgent, WebSearchTool, tool |
| | from smolagents.agents import ActionStep |
| | from smolagents.cli import load_model |
| |
|
| |
|
# Example task: inspect the GitHub trending page and the top author's profile.
# NOTE(review): not referenced elsewhere in the visible part of this file.
github_request = """
I'm trying to find how hard I have to work to get a repo in github.com/trending.
Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
"""

# Example task: find a specific sentence on a Wikipedia page.
# Used as the default value of the positional `prompt` CLI argument.
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""
| |
|
| |
|
def parse_arguments():
    """Parse the command line for the web-agent script.

    Returns:
        The ``argparse.Namespace`` with ``prompt``, ``model_type``, ``model_id``,
        ``provider``, ``api_base`` and ``api_key`` attributes.
    """
    cli = argparse.ArgumentParser(description="Run a web browser automation script with a specified model.")

    # Optional positional prompt; falls back to the built-in Wikipedia example.
    cli.add_argument(
        "prompt",
        type=str,
        nargs="?",
        default=search_request,
        help="The prompt to run with the agent",
    )

    # (flag, default, help) for each optional string flag, in declaration order.
    # A default of None is what argparse uses when no default is given.
    string_flags = (
        (
            "--model-type",
            "LiteLLMModel",
            "The model type to use (e.g., OpenAIServerModel, LiteLLMModel, TransformersModel, InferenceClientModel)",
        ),
        ("--model-id", "gpt-4o", "The model ID to use for the specified model type"),
        ("--provider", None, "The inference provider to use for the model"),
        ("--api-base", None, "The API base to use for the model"),
        ("--api-key", None, "The API key to use for the model"),
    )
    for flag, fallback, help_text in string_flags:
        cli.add_argument(flag, type=str, default=fallback, help=help_text)

    return cli.parse_args()
| |
|
| |
|
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    """Step callback: attach a fresh browser screenshot and the current URL to a step.

    Does nothing (beyond the initial pause) when helium reports no running
    driver, instead of failing on ``driver.current_url``.

    Args:
        memory_step: The step being recorded; its ``observations_images`` and
            ``observations`` attributes are updated in place.
        agent: The agent whose ``memory.steps`` are scanned so that screenshots
            attached to older steps can be dropped.
    """
    sleep(1.0)  # Pause before capturing; presumably lets the page settle — TODO confirm
    driver = helium.get_driver()
    if driver is None:
        # No browser session: there is nothing to capture and no URL to report.
        return

    current_step = memory_step.step_number

    # Drop images from every ActionStep that is at least two steps older than
    # the current one, so only the most recent screenshots stay in memory.
    for previous_memory_step in agent.memory.steps:
        if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
            previous_memory_step.observations_images = None

    png_bytes = driver.get_screenshot_as_png()
    image = PIL.Image.open(BytesIO(png_bytes))
    print(f"Captured a browser screenshot: {image.size} pixels")
    # Store a copy rather than the object returned by open(); presumably so the
    # stored image does not depend on the temporary BytesIO buffer — TODO confirm.
    memory_step.observations_images = [image.copy()]

    # Append the current URL to the step's textual observations.
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )
| |
|
| |
|
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    # NOTE(review): the docstring above is presumably consumed by the @tool
    # decorator as the tool description, so it is intentionally left unchanged.
    #
    # Raises ValueError when nth_result < 1, and Exception when fewer than
    # nth_result matches exist. Uses the module-level `driver`.
    if nth_result < 1:
        # Occurrences are 1-based; 0 or a negative value would otherwise wrap
        # around through negative list indexing and focus the wrong element.
        raise ValueError(f"nth_result must be >= 1 (got {nth_result})")

    # Build a valid XPath string literal for `text`. XPath 1.0 string literals
    # have no escape character, so pick a quote style the text does not
    # contain, or stitch the pieces together with concat() when it has both.
    if "'" not in text:
        xpath_literal = f"'{text}'"
    elif '"' not in text:
        xpath_literal = f'"{text}"'
    else:
        quoted_pieces = ", \"'\", ".join(f"'{piece}'" for piece in text.split("'"))
        xpath_literal = f"concat({quoted_pieces})"

    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), {xpath_literal})]")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
    result = f"Found {len(elements)} matches for '{text}'."
    elem = elements[nth_result - 1]
    # Scroll the chosen match into view so it appears in the next screenshot.
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result
| |
|
| |
|
@tool
def go_back() -> None:
    """Goes back to previous page."""
    # Relies on the module-level `driver` that run_webagent assigns before the
    # agent starts.
    driver.back()
| |
|
| |
|
@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
    """
    # Send ESCAPE through the module-level `driver`.
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    # Honour the declared `-> str` return type instead of implicitly returning
    # None; the message reports only what was actually done.
    return "Sent the ESCAPE key to the page to dismiss pop-ups."
| |
|
| |
|
def initialize_driver():
    """Start a non-headless Chrome through helium and return what ``start_chrome`` returns."""
    options = webdriver.ChromeOptions()
    # Command-line switches are applied in this fixed order.
    chrome_flags = (
        "--force-device-scale-factor=1",
        "--window-size=1000,1350",
        "--disable-pdf-viewer",
        "--window-position=0,0",
    )
    for flag in chrome_flags:
        options.add_argument(flag)
    return helium.start_chrome(headless=False, options=options)
| |
|
| |
|
def initialize_agent(model):
    """Build the browsing ``CodeAgent`` for the given model.

    The agent gets the web-search tool plus the three browser tools defined in
    this file, may import ``helium``, and runs ``save_screenshot`` as a step
    callback.
    """
    browsing_tools = [WebSearchTool(), go_back, close_popups, search_item_ctrl_f]
    agent_settings = {
        "tools": browsing_tools,
        "model": model,
        "additional_authorized_imports": ["helium"],
        "step_callbacks": [save_screenshot],
        "max_steps": 20,
        "verbosity_level": 2,
    }
    return CodeAgent(**agent_settings)
| |
|
| |
|
# Usage guide appended to the user's prompt in run_webagent. It tells the agent
# how to drive the browser with helium and with the tools defined in this file.
helium_instructions = """
Use your web_search tool when you want to get Google search results.
Then you can use helium to access websites. Don't use helium for Google search, only for navigating websites!
Don't bother about the helium driver, it's already managed.
We've already ran "from helium import *"
Then you can go to pages!
<code>
go_to('github.com/trending')
</code>

You can directly click clickable elements by inputting the text that appears on them.
<code>
click("Top products")
</code>

If it's a link:
<code>
click(Link("Top products"))
</code>

If you try to interact with an element and it's not found, you'll get a LookupError.
In general stop your action after each button click to see what happens on your screenshot.
Never try to login in a page.

To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
<code>
scroll_down(num_pixels=1200) # This will scroll one viewport down
</code>

When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
Just use your built-in tool `close_popups` to close them:
<code>
close_popups()
</code>

You can use .exists() to check for the existence of an element. For example:
<code>
if Text('Accept cookies?').exists():
    click('I accept')
</code>

Proceed in several steps rather than trying to solve the task in one shot.
And at the end, only when you have your answer, return your final answer.
<code>
final_answer("YOUR_ANSWER_HERE")
</code>

If pages seem stuck on loading, you might have to wait, for instance `import time` and run `time.sleep(5.0)`. But don't overuse this!
To list elements on page, DO NOT try code-based element searches like 'contributors = find_all(S("ol > li"))': just look at the latest screenshot you have and read it visually, or use your tool search_item_ctrl_f.
Of course, you can act on buttons like a user would do when navigating.
After each code blob you write, you will be automatically provided with an updated screenshot of the browser and the current browser url.
But beware that the screenshot will only be taken at the end of the whole action, it won't see intermediate states.
Don't kill the browser.
When you have modals or cookie banners on screen, you should get rid of them before you can click anything else.
"""
| |
|
| |
|
def run_webagent(
    prompt: str,
    model_type: str,
    model_id: str,
    provider: str | None = None,
    api_base: str | None = None,
    api_key: str | None = None,
) -> None:
    """Run one browsing task: load the model, start the browser, drive the agent.

    Args:
        prompt: The task text; ``helium_instructions`` is appended to it.
        model_type: Model class name forwarded to ``load_model``.
        model_id: Model identifier forwarded to ``load_model``.
        provider: Optional inference provider forwarded to ``load_model``.
        api_base: Optional API base forwarded to ``load_model``.
        api_key: Optional API key forwarded to ``load_model``.
    """
    # The @tool functions in this file read the module-level `driver`.
    global driver

    load_dotenv()
    llm = load_model(model_type, model_id, provider=provider, api_base=api_base, api_key=api_key)

    driver = initialize_driver()
    web_agent = initialize_agent(llm)

    # Run the star import in the agent's executor up front; the instructions
    # tell the agent this import has already been done.
    web_agent.python_executor("from helium import *")
    full_task = prompt + helium_instructions
    web_agent.run(full_task)
| |
|
| |
|
def main() -> None:
    """CLI entry point: parse the command line and launch the web agent."""
    cli_args = parse_arguments()
    run_webagent(
        prompt=cli_args.prompt,
        model_type=cli_args.model_type,
        model_id=cli_args.model_id,
        provider=cli_args.provider,
        api_base=cli_args.api_base,
        api_key=cli_args.api_key,
    )
| |
|
| |
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
| |
|