diff --git a/daemon/cmd/cremotedaemon/main.go b/daemon/cmd/cremotedaemon/main.go index 06b02f5..f5ff5b2 100644 --- a/daemon/cmd/cremotedaemon/main.go +++ b/daemon/cmd/cremotedaemon/main.go @@ -14,13 +14,16 @@ import ( var ( daemonHost = flag.String("listen", "localhost", "Listen address") port = flag.Int("port", 8989, "Listen port") + debug = flag.Bool("debug", false, "Enable debug logging") ) func main() { flag.Parse() + log.Printf("Starting cremote daemon on %s:%d (debug: %v)", *daemonHost, *port, *debug) + // Create and start the daemon - d, err := daemon.NewDaemon(*daemonHost, *port) + d, err := daemon.NewDaemon(*daemonHost, *port, *debug) if err != nil { log.Fatalf("Failed to create daemon: %v", err) } diff --git a/daemon/daemon.go b/daemon/daemon.go index d334988..dbbb6c6 100644 --- a/daemon/daemon.go +++ b/daemon/daemon.go @@ -26,6 +26,7 @@ type Daemon struct { currentTab string // ID of the current/last used tab tabHistory []string // Stack of tab IDs in order of activation (LIFO) consoleLogs map[string][]ConsoleLog // Maps tab ID to console logs + debug bool // Enable debug logging mu sync.Mutex server *http.Server } @@ -71,11 +72,26 @@ func checkChromeDevTools(port int) bool { return resp.StatusCode == 200 } +// debugLog logs a message only if debug mode is enabled +func (d *Daemon) debugLog(format string, args ...interface{}) { + if d.debug { + log.Printf("[DEBUG] "+format, args...) + } +} + // NewDaemon creates a new daemon instance -func NewDaemon(host string, port int) (*Daemon, error) { +func NewDaemon(host string, port int, debug bool) (*Daemon, error) { + if debug { + log.Printf("[DEBUG] Creating new daemon on %s:%d", host, port) + } + // Check if Chrome is running on the debug port chromePort := 9222 // Default Chrome debug port + if debug { + log.Printf("[DEBUG] Checking if Chrome is running on port %d", chromePort) + } + if !checkChromeRunning(chromePort) { return nil, fmt.Errorf("Chromium is not running with remote debugging enabled on port %d.\n\nTo start Chromium with remote debugging:\n chromium --remote-debugging-port=%d --user-data-dir=/tmp/chromium-debug &\n # or\n google-chrome --remote-debugging-port=%d --user-data-dir=/tmp/chrome-debug &\n\nNote: The --user-data-dir flag is required to avoid conflicts with existing browser instances.", chromePort, chromePort, chromePort) } @@ -94,15 +110,23 @@ func NewDaemon(host string, port int) (*Daemon, error) { return nil, fmt.Errorf("Chromium DevTools is responding on port %d but rod connection failed: %w\n\nThis is unexpected. Try restarting Chromium with:\n chromium --remote-debugging-port=%d --user-data-dir=/tmp/chromium-debug &", chromePort, err, chromePort) } + if debug { + log.Printf("[DEBUG] Successfully connected to browser via rod") + } + daemon := &Daemon{ browser: browser, tabs: make(map[string]*rod.Page), iframePages: make(map[string]*rod.Page), tabHistory: make([]string, 0), consoleLogs: make(map[string][]ConsoleLog), + debug: debug, } + daemon.debugLog("Daemon struct initialized") + // Create HTTP server + daemon.debugLog("Setting up HTTP server") mux := http.NewServeMux() mux.HandleFunc("/command", daemon.handleCommand) mux.HandleFunc("/status", daemon.handleStatus) @@ -114,13 +138,18 @@ func NewDaemon(host string, port int) (*Daemon, error) { Handler: mux, } + daemon.debugLog("HTTP server configured on %s:%d", host, port) + return daemon, nil } // Start starts the daemon server func (d *Daemon) Start() error { log.Printf("Starting daemon server on %s", d.server.Addr) - return d.server.ListenAndServe() + d.debugLog("About to call ListenAndServe()") + err := d.server.ListenAndServe() + d.debugLog("ListenAndServe() returned with error: %v", err) + return err } // Stop stops the daemon server @@ -173,7 +202,10 @@ func (d *Daemon) handleStatus(w http.ResponseWriter, r *http.Request) { // handleCommand handles command requests func (d *Daemon) handleCommand(w http.ResponseWriter, r *http.Request) { + d.debugLog("Received HTTP request: %s %s", r.Method, r.URL.Path) + if r.Method != http.MethodPost { + d.debugLog("Invalid method: %s", r.Method) http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) return } @@ -181,10 +213,13 @@ func (d *Daemon) handleCommand(w http.ResponseWriter, r *http.Request) { var cmd Command err := json.NewDecoder(r.Body).Decode(&cmd) if err != nil { + d.debugLog("Failed to decode JSON: %v", err) http.Error(w, "Invalid request body", http.StatusBadRequest) return } + d.debugLog("Processing command: %s with params: %+v", cmd.Action, cmd.Params) + var response Response switch cmd.Action { @@ -510,20 +545,25 @@ func (d *Daemon) handleCommand(w http.ResponseWriter, r *http.Request) { } default: + d.debugLog("Unknown action: %s", cmd.Action) response = Response{Success: false, Error: "Unknown action"} } + d.debugLog("Command %s completed, sending response: success=%v", cmd.Action, response.Success) w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(response) + d.debugLog("Response sent for command: %s", cmd.Action) } // openTab opens a new tab and returns its ID func (d *Daemon) openTab(timeout int) (string, error) { + d.debugLog("Opening new tab with timeout: %d", timeout) d.mu.Lock() defer d.mu.Unlock() // Create a context with timeout if specified if timeout > 0 { + d.debugLog("Using timeout context: %d seconds", timeout) ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Second) defer cancel() @@ -768,10 +808,13 @@ func (d *Daemon) closeTab(tabID string, timeout int) error { // loadURL loads a URL in a tab func (d *Daemon) loadURL(tabID, url string, timeout int) error { + d.debugLog("Loading URL: %s in tab: %s with timeout: %d", url, tabID, timeout) page, err := d.getTab(tabID) if err != nil { + d.debugLog("Failed to get tab %s: %v", tabID, err) return err } + d.debugLog("Got tab %s, starting navigation", tabID) if timeout > 0 { // Use timeout for the URL loading diff --git a/mcp/LLM_MCP_GUIDE.md b/mcp/LLM_MCP_GUIDE.md deleted file mode 100644 index cc8f088..0000000 --- a/mcp/LLM_MCP_GUIDE.md +++ /dev/null @@ -1,642 +0,0 @@ -# LLM Agent Guide: Using Cremote MCP Server for Web Automation - -This document provides comprehensive guidance for LLM agents on how to use the **Cremote MCP Server** for intelligent web automation. The MCP server provides a structured, stateful interface that's optimized for AI-driven web testing and automation workflows. - -## What is the Cremote MCP Server? - -The **Cremote MCP Server** is a Model Context Protocol implementation that wraps cremote's web automation capabilities in a structured API designed specifically for LLMs. Unlike CLI commands, the MCP server provides: - -- **Automatic State Management**: Tracks current tab, tab history, and iframe context -- **Intelligent Abstractions**: High-level tools that combine multiple operations -- **Rich Error Context**: Detailed error information for better debugging -- **Automatic Screenshots**: Built-in screenshot capture for documentation -- **Structured Responses**: Consistent, parseable JSON responses - -## Prerequisites - -Before using the MCP server, ensure the cremote infrastructure is running: - -1. **Check if everything is already running:** - ```bash - cremote status - ``` - -2. **Start Chromium with remote debugging (if needed):** - ```bash - chromium --remote-debugging-port=9222 --user-data-dir=/tmp/chromium-debug & - ``` - -3. **Start cremote daemon (if needed):** - ```bash - cremotedaemon & - ``` - -4. **The MCP server should be configured in your MCP client** (e.g., Claude Desktop) - -## Available MCP Tools - -### 1. `web_navigate` - Smart Navigation - -Navigate to URLs with automatic tab management and optional screenshot capture. - -**Parameters:** -- `url` (required): URL to navigate to -- `tab` (optional): Specific tab ID (uses current tab if not specified) -- `screenshot` (optional): Take screenshot after navigation (default: false) -- `timeout` (optional): Timeout in seconds (default: 5) - -**Example:** -```json -{ - "name": "web_navigate", - "arguments": { - "url": "https://example.com/login", - "screenshot": true, - "timeout": 10 - } -} -``` - -**Smart Behavior:** -- Automatically opens a new tab if none exists -- Updates current tab tracking -- Adds tab to history for easy switching - -### 2. `web_interact` - Element Interactions - -Interact with web elements through a unified interface. - -**Parameters:** -- `action` (required): "click", "fill", "submit", or "upload" -- `selector` (required): CSS selector for the target element -- `value` (optional): Value for fill/upload actions -- `tab` (optional): Tab ID (uses current tab if not specified) -- `timeout` (optional): Timeout in seconds (default: 5) - -**Examples:** -```json -// Fill a form field -{ - "name": "web_interact", - "arguments": { - "action": "fill", - "selector": "#username", - "value": "testuser" - } -} - -// Click a button -{ - "name": "web_interact", - "arguments": { - "action": "click", - "selector": "#login-button" - } -} - -// Submit a form -{ - "name": "web_interact", - "arguments": { - "action": "submit", - "selector": "form#login-form" - } -} - -// Upload a file -{ - "name": "web_interact", - "arguments": { - "action": "upload", - "selector": "input[type=file]", - "value": "/path/to/file.pdf" - } -} -``` - -### 3. `web_extract` - Data Extraction - -Extract information from web pages through multiple methods. - -**Parameters:** -- `type` (required): "source", "element", or "javascript" -- `selector` (optional): CSS selector (required for "element" type) -- `code` (optional): JavaScript code (required for "javascript" type) -- `tab` (optional): Tab ID (uses current tab if not specified) -- `timeout` (optional): Timeout in seconds (default: 5) - -**Examples:** -```json -// Get page source -{ - "name": "web_extract", - "arguments": { - "type": "source" - } -} - -// Get specific element HTML -{ - "name": "web_extract", - "arguments": { - "type": "element", - "selector": ".error-message" - } -} - -// Execute JavaScript and get result -{ - "name": "web_extract", - "arguments": { - "type": "javascript", - "code": "document.title" - } -} - -// Check form validation -{ - "name": "web_extract", - "arguments": { - "type": "javascript", - "code": "document.getElementById('email').validity.valid" - } -} -``` - -### 4. `web_screenshot` - Screenshot Capture - -Take screenshots for documentation and debugging. - -**Parameters:** -- `output` (required): File path for the screenshot -- `full_page` (optional): Capture full page vs viewport (default: false) -- `tab` (optional): Tab ID (uses current tab if not specified) -- `timeout` (optional): Timeout in seconds (default: 5) - -**Examples:** -```json -// Viewport screenshot -{ - "name": "web_screenshot", - "arguments": { - "output": "/tmp/login-page.png" - } -} - -// Full page screenshot -{ - "name": "web_screenshot", - "arguments": { - "output": "/tmp/full-page.png", - "full_page": true - } -} -``` - -### 5. `web_manage_tabs` - Tab Management - -Manage browser tabs with automatic state tracking. - -**Parameters:** -- `action` (required): "open", "close", "list", or "switch" -- `tab` (optional): Tab ID (required for "close" and "switch" actions) -- `timeout` (optional): Timeout in seconds (default: 5) - -**Examples:** -```json -// Open new tab -{ - "name": "web_manage_tabs", - "arguments": { - "action": "open" - } -} - -// List all tabs -{ - "name": "web_manage_tabs", - "arguments": { - "action": "list" - } -} - -// Switch to specific tab -{ - "name": "web_manage_tabs", - "arguments": { - "action": "switch", - "tab": "tab-id-123" - } -} - -// Close current tab -{ - "name": "web_manage_tabs", - "arguments": { - "action": "close" - } -} -``` - -### 6. `web_iframe` - Iframe Context Management - -Switch between main page and iframe contexts for testing embedded content. - -**Parameters:** -- `action` (required): "enter" or "exit" -- `selector` (optional): Iframe CSS selector (required for "enter" action) -- `tab` (optional): Tab ID (uses current tab if not specified) - -**Examples:** -```json -// Enter iframe context -{ - "name": "web_iframe", - "arguments": { - "action": "enter", - "selector": "iframe#payment-form" - } -} - -// Exit iframe context (return to main page) -{ - "name": "web_iframe", - "arguments": { - "action": "exit" - } -} -``` - -## Response Format - -All MCP tools return a consistent response structure: - -```json -{ - "success": true, - "data": "...", // Tool-specific response data - "screenshot": "/tmp/shot.png", // Screenshot path (if captured) - "current_tab": "tab-id-123", // Current active tab - "tab_history": ["tab-id-123"], // Tab history stack - "iframe_mode": false, // Whether in iframe context - "error": null, // Error message (if failed) - "metadata": {} // Additional context information -} -``` - -## Common Automation Patterns - -### 1. Login Flow Testing - -```json -// 1. Navigate to login page with screenshot -{ - "name": "web_navigate", - "arguments": { - "url": "https://myapp.com/login", - "screenshot": true - } -} - -// 2. Fill credentials -{ - "name": "web_interact", - "arguments": { - "action": "fill", - "selector": "#email", - "value": "user@example.com" - } -} - -{ - "name": "web_interact", - "arguments": { - "action": "fill", - "selector": "#password", - "value": "password123" - } -} - -// 3. Submit login -{ - "name": "web_interact", - "arguments": { - "action": "click", - "selector": "#login-button" - } -} - -// 4. Verify success -{ - "name": "web_extract", - "arguments": { - "type": "javascript", - "code": "document.querySelector('.welcome-message')?.textContent" - } -} - -// 5. Document result -{ - "name": "web_screenshot", - "arguments": { - "output": "/tmp/login-success.png" - } -} -``` - -### 2. Form Validation Testing - -```json -// 1. Navigate to form -{ - "name": "web_navigate", - "arguments": { - "url": "https://myapp.com/register" - } -} - -// 2. Test empty form submission -{ - "name": "web_interact", - "arguments": { - "action": "click", - "selector": "#submit-button" - } -} - -// 3. Check for validation errors -{ - "name": "web_extract", - "arguments": { - "type": "element", - "selector": ".error-message" - } -} - -// 4. Test invalid email -{ - "name": "web_interact", - "arguments": { - "action": "fill", - "selector": "#email", - "value": "invalid-email" - } -} - -// 5. Verify JavaScript validation -{ - "name": "web_extract", - "arguments": { - "type": "javascript", - "code": "document.getElementById('email').validity.valid" - } -} -``` - -### 3. Multi-Tab Workflow - -```json -// 1. Open multiple tabs for comparison -{ - "name": "web_manage_tabs", - "arguments": { - "action": "open" - } -} - -{ - "name": "web_navigate", - "arguments": { - "url": "https://app.com/admin" - } -} - -{ - "name": "web_manage_tabs", - "arguments": { - "action": "open" - } -} - -{ - "name": "web_navigate", - "arguments": { - "url": "https://app.com/user" - } -} - -// 2. List tabs to see current state -{ - "name": "web_manage_tabs", - "arguments": { - "action": "list" - } -} - -// 3. Switch between tabs as needed -{ - "name": "web_manage_tabs", - "arguments": { - "action": "switch", - "tab": "first-tab-id" - } -} -``` - -### 4. Iframe Testing (Payment Forms, Widgets) - -```json -// 1. Navigate to page with iframe -{ - "name": "web_navigate", - "arguments": { - "url": "https://shop.com/checkout" - } -} - -// 2. Enter iframe context -{ - "name": "web_iframe", - "arguments": { - "action": "enter", - "selector": "iframe.payment-frame" - } -} - -// 3. Interact with iframe content -{ - "name": "web_interact", - "arguments": { - "action": "fill", - "selector": "#card-number", - "value": "4111111111111111" - } -} - -{ - "name": "web_interact", - "arguments": { - "action": "fill", - "selector": "#expiry", - "value": "12/25" - } -} - -// 4. Exit iframe context -{ - "name": "web_iframe", - "arguments": { - "action": "exit" - } -} - -// 5. Continue with main page -{ - "name": "web_interact", - "arguments": { - "action": "click", - "selector": "#complete-order" - } -} -``` - -## Best Practices for LLMs - -### 1. State Awareness -- The MCP server automatically tracks state, but always check the response for current context -- Use the `current_tab` and `iframe_mode` fields to understand your current position -- The `tab_history` helps you understand available tabs - -### 2. Error Handling -- Always check the `success` field in responses -- Use the `error` field for detailed error information -- Take screenshots when errors occur for debugging: `"screenshot": true` - -### 3. Timeout Management -- Use longer timeouts for slow-loading pages or complex interactions -- Default 5-second timeouts work for most scenarios -- Increase timeouts for file uploads or heavy JavaScript applications - -### 4. Screenshot Strategy -- Take screenshots at key points for documentation -- Use `full_page: true` for comprehensive page captures -- Screenshot before and after critical actions for debugging - -### 5. Verification Patterns -- Always verify actions completed successfully -- Use JavaScript extraction to check application state -- Combine element extraction with JavaScript validation - -## Debugging Failed Tests - -### 1. Capture Current State -```json -// Get page source for analysis -{ - "name": "web_extract", - "arguments": { - "type": "source" - } -} - -// Take screenshot to see visual state -{ - "name": "web_screenshot", - "arguments": { - "output": "/tmp/debug-state.png", - "full_page": true - } -} - -// Check JavaScript console errors -{ - "name": "web_extract", - "arguments": { - "type": "javascript", - "code": "console.error.toString()" - } -} -``` - -### 2. Element Debugging -```json -// Check if element exists -{ - "name": "web_extract", - "arguments": { - "type": "javascript", - "code": "document.querySelector('#my-element') !== null" - } -} - -// Get element properties -{ - "name": "web_extract", - "arguments": { - "type": "javascript", - "code": "JSON.stringify({visible: document.querySelector('#my-element')?.offsetParent !== null, text: document.querySelector('#my-element')?.textContent})" - } -} -``` - -### 3. Network and Loading Issues -```json -// Check if page is still loading -{ - "name": "web_extract", - "arguments": { - "type": "javascript", - "code": "document.readyState" - } -} - -// Check for JavaScript errors -{ - "name": "web_extract", - "arguments": { - "type": "javascript", - "code": "window.onerror ? 'Errors detected' : 'No errors'" - } -} -``` - -## Advantages Over CLI Commands - -### 1. **Automatic State Management** -- No need to manually track tab IDs -- Automatic current tab resolution -- Persistent iframe context tracking - -### 2. **Rich Error Context** -- Detailed error messages with context -- Automatic screenshot capture on failures -- Structured error responses for better debugging - -### 3. **Intelligent Abstractions** -- Combined operations in single tools -- Smart parameter defaults and validation -- Automatic resource management - -### 4. **Better Performance** -- Direct library integration (no subprocess overhead) -- Persistent connections to cremote daemon -- Efficient state tracking - -### 5. **Structured Responses** -- Consistent JSON format for all responses -- Rich metadata for decision making -- Easy parsing and error handling - -## Key Differences from CLI Usage - -| Aspect | CLI Commands | MCP Server | -|--------|-------------|------------| -| **State Tracking** | Manual tab ID management | Automatic state management | -| **Error Handling** | Text parsing required | Structured error objects | -| **Screenshots** | Manual command execution | Automatic capture options | -| **Performance** | Subprocess overhead | Direct library calls | -| **Response Format** | Text output | Structured JSON | -| **Context Management** | Manual iframe tracking | Automatic context switching | -| **Resource Cleanup** | Manual tab management | Automatic resource tracking | - -The Cremote MCP Server transforms web automation from a series of CLI commands into an intelligent, stateful API that's optimized for AI-driven testing and automation workflows. diff --git a/mcp/cremote-mcp2 b/mcp/cremote-mcp2 new file mode 100755 index 0000000..86bb0af Binary files /dev/null and b/mcp/cremote-mcp2 differ