Skip to content

Commit 06c3025

Browse files
committed
New interpreter --os powered by Anthropic
1 parent 3ead0bd commit 06c3025

File tree

3 files changed

+75
-12
lines changed

3 files changed

+75
-12
lines changed

interpreter/computer_use/loop.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ class APIProvider(StrEnum):
9494
* If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext to convert it to a text file, and then read that text file directly with your StrReplaceEditTool.
9595
</IMPORTANT>"""
9696

97-
X_SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
97+
SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
9898
* You are an AI assistant with access to a virtual machine running on {platform.machine()} architecture.
9999
* You have the capability to edit and run code.
100100
* You can use the bash tool to run shell commands, execute scripts, and manage files.
@@ -104,6 +104,18 @@ class APIProvider(StrEnum):
104104
* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
105105
</SYSTEM_CAPABILITY>"""
106106

107+
SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
108+
* You are an AI assistant with access to a virtual machine running on {"Mac OS" if platform.system() == "Darwin" else platform.system()} with internet access.
109+
* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
110+
</SYSTEM_CAPABILITY>"""
111+
112+
# Update the SYSTEM_PROMPT for Mac OS
113+
if platform.system() == "Darwin":
114+
SYSTEM_PROMPT += """
115+
<IMPORTANT>
116+
* Open applications using Spotlight by using the computer tool to simulate pressing Command+Space, typing the application name, and pressing Enter.
117+
</IMPORTANT>"""
118+
107119

108120
async def sampling_loop(
109121
*,
@@ -122,8 +134,8 @@ async def sampling_loop(
122134
"""
123135
tool_collection = ToolCollection(
124136
ComputerTool(),
125-
BashTool(),
126-
EditTool(),
137+
# BashTool(),
138+
# EditTool(),
127139
)
128140
system = (
129141
f"{SYSTEM_PROMPT}{' ' + system_prompt_suffix if system_prompt_suffix else ''}"
@@ -227,7 +239,7 @@ async def sampling_loop(
227239
def _maybe_filter_to_n_most_recent_images(
228240
messages: list[BetaMessageParam],
229241
images_to_keep: int,
230-
min_removal_threshold: int = 10,
242+
min_removal_threshold: int = 5,
231243
):
232244
"""
233245
With the assumption that images are screenshots that are of diminishing value as
@@ -353,6 +365,9 @@ async def main():
353365
**Warning:** In this mode, Open Interpreter will not require approval before performing actions. Move your mouse to any corner of the screen to exit.
354366
"""
355367

368+
markdown_text = f"""> Model set to `Claude 3.5` OS control enabled
369+
"""
370+
356371
print_markdown(markdown_text)
357372

358373
# Start the mouse position checking thread

interpreter/computer_use/tools/computer.py

Lines changed: 55 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
import asyncio
22
import base64
3+
import math
34
import os
45
import platform
56
import shlex
67
import shutil
8+
import tempfile
9+
import time
710
from enum import StrEnum
811
from pathlib import Path
912
from typing import Literal, TypedDict
@@ -64,6 +67,30 @@ def chunks(s: str, chunk_size: int) -> list[str]:
6467
return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)]
6568

6669

70+
def smooth_move_to(x, y, duration=1.2):
    """Glide the mouse pointer from its current position to (x, y).

    The pointer follows a straight line whose progress is shaped by an
    ease-in/ease-out sine curve, so it starts and finishes slowly. The call
    blocks until roughly ``duration`` seconds have elapsed and always ends
    with the pointer placed exactly on the target.

    Args:
        x: Target horizontal screen coordinate.
        y: Target vertical screen coordinate.
        duration: Length of the animation in seconds. A value of zero or
            less skips the animation and moves the pointer straight to the
            target.
    """
    start_x, start_y = pyautogui.position()
    dx = x - start_x
    dy = y - start_y

    # Only animate for a positive duration; this also keeps the progress
    # computation below from ever dividing by zero.
    if duration > 0:
        # A monotonic clock is used so that wall-clock adjustments cannot
        # shorten, lengthen, or reverse the animation.
        start_time = time.monotonic()

        while True:
            elapsed_time = time.monotonic() - start_time
            if elapsed_time > duration:
                break

            # Linear progress in [0, 1], reshaped by easeInOutSine.
            t = elapsed_time / duration
            eased_t = (1 - math.cos(t * math.pi)) / 2

            pyautogui.moveTo(start_x + dx * eased_t, start_y + dy * eased_t)

    # Ensure the mouse ends up exactly at the target (x, y), regardless of
    # where the last animated step happened to land.
    pyautogui.moveTo(x, y)
92+
93+
6794
class ComputerTool(BaseAnthropicTool):
6895
"""
6996
A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.
@@ -123,8 +150,9 @@ async def __call__(
123150
)
124151

125152
if action == "mouse_move":
126-
pyautogui.moveTo(x, y)
153+
smooth_move_to(x, y)
127154
elif action == "left_click_drag":
155+
smooth_move_to(x, y)
128156
pyautogui.dragTo(x, y, button="left")
129157

130158
elif action in ("key", "type"):
@@ -136,20 +164,38 @@ async def __call__(
136164
text = text.replace("super+", "command+")
137165
keys = text.split("+")
138166
if len(keys) > 1:
139-
pyautogui.hotkey(*keys)
167+
if "darwin" in platform.system().lower():
168+
# Use AppleScript for hotkey on macOS
169+
keystroke, modifier = (keys[-1], "+".join(keys[:-1]))
170+
modifier = modifier.lower() + " down"
171+
if keystroke.lower() == "space":
172+
keystroke = " "
173+
elif keystroke.lower() == "enter":
174+
keystroke = "\n"
175+
script = f"""
176+
tell application "System Events"
177+
keystroke "{keystroke}" using {modifier}
178+
end tell
179+
"""
180+
os.system("osascript -e '{}'".format(script))
181+
else:
182+
pyautogui.hotkey(*keys)
140183
else:
141184
pyautogui.press(text)
142185
elif action == "type":
143186
pyautogui.write(text, interval=TYPING_DELAY_MS / 1000)
144187

145188
elif action in ("left_click", "right_click", "double_click", "middle_click"):
189+
time.sleep(0.1)
146190
button = {
147191
"left_click": "left",
148192
"right_click": "right",
149193
"middle_click": "middle",
150194
}
151195
if action == "double_click":
152-
pyautogui.doubleClick()
196+
pyautogui.click()
197+
time.sleep(0.1)
198+
pyautogui.click()
153199
else:
154200
pyautogui.click(button=button.get(action, "left"))
155201

@@ -170,9 +216,9 @@ async def __call__(
170216

171217
async def screenshot(self):
172218
"""Take a screenshot of the current screen and return the base64 encoded image."""
173-
output_dir = Path(OUTPUT_DIR)
174-
output_dir.mkdir(parents=True, exist_ok=True)
175-
path = output_dir / f"screenshot_{uuid4().hex}.png"
219+
# Use a user-writable directory for temporary files
220+
temp_dir = Path(tempfile.gettempdir())
221+
path = temp_dir / f"screenshot_{uuid4().hex}.png"
176222

177223
screenshot = pyautogui.screenshot()
178224
screenshot.save(str(path))
@@ -186,7 +232,9 @@ async def screenshot(self):
186232
)
187233

188234
if path.exists():
189-
return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())
235+
base64_image = base64.b64encode(path.read_bytes()).decode()
236+
path.unlink() # Remove the temporary file
237+
return ToolResult(base64_image=base64_image)
190238
raise ToolError(f"Failed to take screenshot")
191239

192240
async def shell(self, command: str, take_screenshot=True) -> ToolResult:

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ packages = [
44
{include = "interpreter"},
55
{include = "scripts"},
66
]
7-
version = "0.3.14" # Use "-rc1", "-rc2", etc. for pre-release versions
7+
version = "0.4.0" # Use "-rc1", "-rc2", etc. for pre-release versions
88
description = "Let language models run code"
99
authors = ["Killian Lucas <[email protected]>"]
1010
readme = "README.md"

0 commit comments

Comments
 (0)