From e360ccbf152f72a067edfcf8955f645c2b5417b7 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 18 Feb 2026 08:33:21 +0100 Subject: [PATCH 1/5] Update templates and e2e tests to use specialized Playwright docker images --- .github/workflows/on_schedule_tests.yaml | 4 ++-- .../project_template/cookiecutter.json | 2 +- .../templates/main_playwright_chrome.py | 14 ++++++++++++++ .../templates/main_playwright_firefox.py | 14 ++++++++++++++ .../templates/main_playwright_webkit.py | 14 ++++++++++++++ .../templates/routes_playwright_camoufox.py | 19 ------------------- .../{{cookiecutter.project_name}}/Dockerfile | 9 +++++++-- .../pyproject.toml | 2 +- .../requirements.txt | 2 ++ .../{{cookiecutter.__package_name}}/routes.py | 4 ++++ .../test_static_crawlers_templates.py | 3 +++ 11 files changed, 62 insertions(+), 25 deletions(-) create mode 100644 src/crawlee/project_template/templates/main_playwright_chrome.py create mode 100644 src/crawlee/project_template/templates/main_playwright_firefox.py create mode 100644 src/crawlee/project_template/templates/main_playwright_webkit.py delete mode 100644 src/crawlee/project_template/templates/routes_playwright_camoufox.py diff --git a/.github/workflows/on_schedule_tests.yaml b/.github/workflows/on_schedule_tests.yaml index fafe09fcca..116a9c8a5e 100644 --- a/.github/workflows/on_schedule_tests.yaml +++ b/.github/workflows/on_schedule_tests.yaml @@ -24,8 +24,8 @@ jobs: fail-fast: false max-parallel: 12 matrix: - crawler-type: ["playwright_camoufox", "playwright", "parsel", "beautifulsoup"] - http-client: [ "httpx", "curl_impersonate"] + crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup"] + http-client: ["httpx", "curl_impersonate"] package-manager: ["pip", "uv", "poetry"] runs-on: "ubuntu-latest" diff --git a/src/crawlee/project_template/cookiecutter.json b/src/crawlee/project_template/cookiecutter.json index 9026851051..53e0c8f445 
100644 --- a/src/crawlee/project_template/cookiecutter.json +++ b/src/crawlee/project_template/cookiecutter.json @@ -1,7 +1,7 @@ { "project_name": "crawlee-python-project", "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}", - "crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox"], + "crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit"], "__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}", "http_client": ["impit", "httpx", "curl-impersonate"], "package_manager": ["poetry", "pip", "uv"], diff --git a/src/crawlee/project_template/templates/main_playwright_chrome.py b/src/crawlee/project_template/templates/main_playwright_chrome.py new file mode 100644 index 0000000000..a855df1d0f --- /dev/null +++ b/src/crawlee/project_template/templates/main_playwright_chrome.py @@ -0,0 +1,14 @@ +# % extends 'main.py' + +# % block import +from crawlee.crawlers import PlaywrightCrawler +# % endblock + +# % block instantiation +crawler = PlaywrightCrawler( + request_handler=router, + headless=True, + max_requests_per_crawl=10, + browser_type="chrome", + {{ self.http_client_instantiation() }}) +# % endblock diff --git a/src/crawlee/project_template/templates/main_playwright_firefox.py b/src/crawlee/project_template/templates/main_playwright_firefox.py new file mode 100644 index 0000000000..dc05a681d1 --- /dev/null +++ b/src/crawlee/project_template/templates/main_playwright_firefox.py @@ -0,0 +1,14 @@ +# % extends 'main.py' + +# % block import +from crawlee.crawlers import PlaywrightCrawler +# % endblock + +# % block instantiation +crawler = PlaywrightCrawler( + request_handler=router, + headless=True, + max_requests_per_crawl=10, + browser_type="firefox", + {{ self.http_client_instantiation() }}) +# % endblock diff --git a/src/crawlee/project_template/templates/main_playwright_webkit.py 
b/src/crawlee/project_template/templates/main_playwright_webkit.py new file mode 100644 index 0000000000..d5d2cd0a1a --- /dev/null +++ b/src/crawlee/project_template/templates/main_playwright_webkit.py @@ -0,0 +1,14 @@ +# % extends 'main.py' + +# % block import +from crawlee.crawlers import PlaywrightCrawler +# % endblock + +# % block instantiation +crawler = PlaywrightCrawler( + request_handler=router, + headless=True, + max_requests_per_crawl=10, + browser_type="webkit", + {{ self.http_client_instantiation() }}) +# % endblock diff --git a/src/crawlee/project_template/templates/routes_playwright_camoufox.py b/src/crawlee/project_template/templates/routes_playwright_camoufox.py deleted file mode 100644 index 9ab35a2a80..0000000000 --- a/src/crawlee/project_template/templates/routes_playwright_camoufox.py +++ /dev/null @@ -1,19 +0,0 @@ -from crawlee.crawlers import PlaywrightCrawlingContext -from crawlee.router import Router - -router = Router[PlaywrightCrawlingContext]() - - -@router.default_handler -async def default_handler(context: PlaywrightCrawlingContext) -> None: - """Default request handler.""" - context.log.info(f'Processing {context.request.url} ...') - title = await context.page.query_selector('title') - await context.push_data( - { - 'url': context.request.loaded_url, - 'title': await title.inner_text() if title else None, - } - ) - - await context.enqueue_links() diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile index dda254f8f0..323181d058 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile @@ -4,8 +4,13 @@ # % if cookiecutter.crawler_type == 'playwright' FROM apify/actor-python-playwright:3.13 # % elif cookiecutter.crawler_type == 'playwright-camoufox' -# Currently camoufox has issues installing on Python 3.13 -FROM 
apify/actor-python-playwright:3.12 +FROM apify/actor-python-playwright-camoufox:3.13 +# % elif cookiecutter.crawler_type == 'playwright-chrome' +FROM apify/actor-python-playwright-chrome:3.13 +# % elif cookiecutter.crawler_type == 'playwright-firefox' +FROM apify/actor-python-playwright-firefox:3.13 +# % elif cookiecutter.crawler_type == 'playwright-webkit' +FROM apify/actor-python-playwright-webkit:3.13 # % else FROM apify/actor-python:3.13 # % endif diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml index fd6d28c7c2..2de6aa2532 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml @@ -1,4 +1,4 @@ -# % if cookiecutter.crawler_type == 'playwright-camoufox' +# % if cookiecutter.crawler_type.startswith('playwright') # % set extras = ['playwright'] # % else # % set extras = [cookiecutter.crawler_type] diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt index ef69bdc1e4..8ac28ed5e4 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt @@ -1,5 +1,7 @@ # % if cookiecutter.crawler_type == 'playwright-camoufox' camoufox[geoip]~=0.4.5 +# % endif +# % if cookiecutter.crawler_type.startswith('playwright') # % set extras = ['playwright'] # % else # % set extras = [cookiecutter.crawler_type] diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py index ab665288b7..7ee8583b82 100644 --- 
a/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py @@ -1 +1,5 @@ +# % if cookiecutter.crawler_type.startswith('playwright') +# % include 'routes_playwright.py' +# % else # % include 'routes_%s.py' % cookiecutter.__crawler_type +# % endif diff --git a/tests/e2e/project_template/test_static_crawlers_templates.py b/tests/e2e/project_template/test_static_crawlers_templates.py index cb951587e0..34a30b61ae 100644 --- a/tests/e2e/project_template/test_static_crawlers_templates.py +++ b/tests/e2e/project_template/test_static_crawlers_templates.py @@ -20,6 +20,9 @@ 'crawler_type', [ pytest.param('playwright-camoufox', marks=pytest.mark.playwright_camoufox), + pytest.param('playwright-chrome', marks=pytest.mark.playwright_chrome), + pytest.param('playwright-firefox', marks=pytest.mark.playwright_firefox), + pytest.param('playwright-webkit', marks=pytest.mark.playwright_webkit), pytest.param('playwright', marks=pytest.mark.playwright), pytest.param('parsel', marks=pytest.mark.parsel), pytest.param('beautifulsoup', marks=pytest.mark.beautifulsoup), From 1e4a7ec5f1e06f4d56b51e7f6c3cab5828dcc781 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 18 Feb 2026 10:26:25 +0100 Subject: [PATCH 2/5] Replace exception when using `chrome` and having explicit `executable_path` by debug log --- src/crawlee/browsers/_playwright_browser_plugin.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/crawlee/browsers/_playwright_browser_plugin.py b/src/crawlee/browsers/_playwright_browser_plugin.py index fe9eb09e6e..b23551479d 100644 --- a/src/crawlee/browsers/_playwright_browser_plugin.py +++ b/src/crawlee/browsers/_playwright_browser_plugin.py @@ -83,16 +83,16 @@ def __init__( 'chromium_sandbox': not config.disable_browser_sandbox, } - if browser_type == 'chrome' and 
default_launch_browser_options['executable_path']: - raise ValueError( - 'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.' - ) - # Map 'chrome' to 'chromium' with the 'chrome' channel. if browser_type == 'chrome': browser_type = 'chromium' # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome. default_launch_browser_options['channel'] = 'chrome' + if default_launch_browser_options['executable_path']: + logger.debug( + f'Using browser executable from {default_launch_browser_options["executable_path"]},' + f" which takes precedence over 'chrome' channel." + ) self._browser_type: BrowserType = browser_type self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {}) From 5820e073305843ec8c53178398f8cc7d0ef80074 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 18 Feb 2026 15:48:35 +0100 Subject: [PATCH 3/5] ai review comments --- src/crawlee/browsers/_playwright_browser_plugin.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/crawlee/browsers/_playwright_browser_plugin.py b/src/crawlee/browsers/_playwright_browser_plugin.py index b23551479d..2b68a1c91c 100644 --- a/src/crawlee/browsers/_playwright_browser_plugin.py +++ b/src/crawlee/browsers/_playwright_browser_plugin.py @@ -82,20 +82,24 @@ def __init__( 'executable_path': config.default_browser_path, 'chromium_sandbox': not config.disable_browser_sandbox, } + explicit_browser_launch_options = browser_launch_options or {} # Map 'chrome' to 'chromium' with the 'chrome' channel. if browser_type == 'chrome': browser_type = 'chromium' # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome. 
default_launch_browser_options['channel'] = 'chrome' - if default_launch_browser_options['executable_path']: + + if explicit_browser_launch_options.get( + 'executable_path', default_launch_browser_options.get('executable_path') + ): logger.debug( f'Using browser executable from {default_launch_browser_options["executable_path"]},' f" which takes precedence over 'chrome' channel." ) self._browser_type: BrowserType = browser_type - self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {}) + self._browser_launch_options: dict[str, Any] = default_launch_browser_options | explicit_browser_launch_options self._browser_new_context_options = browser_new_context_options or {} self._max_open_pages_per_browser = max_open_pages_per_browser self._use_incognito_pages = use_incognito_pages From 42a89efa08fd572d406bf05fbe9f046e959da637 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 19 Feb 2026 08:34:17 +0100 Subject: [PATCH 4/5] Review comments --- src/crawlee/browsers/_playwright_browser_plugin.py | 5 ++--- .../project_template/templates/main_playwright_chrome.py | 3 ++- .../project_template/templates/main_playwright_firefox.py | 3 ++- .../project_template/templates/main_playwright_webkit.py | 3 ++- src/crawlee/project_template/templates/routes_camoufox.py | 2 +- tests/e2e/conftest.py | 3 +++ 6 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/crawlee/browsers/_playwright_browser_plugin.py b/src/crawlee/browsers/_playwright_browser_plugin.py index 2b68a1c91c..755aec68e1 100644 --- a/src/crawlee/browsers/_playwright_browser_plugin.py +++ b/src/crawlee/browsers/_playwright_browser_plugin.py @@ -90,12 +90,11 @@ def __init__( # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome. 
default_launch_browser_options['channel'] = 'chrome' - if explicit_browser_launch_options.get( + if executable_path := explicit_browser_launch_options.get( 'executable_path', default_launch_browser_options.get('executable_path') ): logger.debug( - f'Using browser executable from {default_launch_browser_options["executable_path"]},' - f" which takes precedence over 'chrome' channel." + f"Using browser executable from {executable_path}, which takes precedence over 'chrome' channel." ) self._browser_type: BrowserType = browser_type diff --git a/src/crawlee/project_template/templates/main_playwright_chrome.py b/src/crawlee/project_template/templates/main_playwright_chrome.py index a855df1d0f..e40ffb7426 100644 --- a/src/crawlee/project_template/templates/main_playwright_chrome.py +++ b/src/crawlee/project_template/templates/main_playwright_chrome.py @@ -10,5 +10,6 @@ headless=True, max_requests_per_crawl=10, browser_type="chrome", - {{ self.http_client_instantiation() }}) + {{ self.http_client_instantiation() }} +) # % endblock diff --git a/src/crawlee/project_template/templates/main_playwright_firefox.py b/src/crawlee/project_template/templates/main_playwright_firefox.py index dc05a681d1..d18f45f1a3 100644 --- a/src/crawlee/project_template/templates/main_playwright_firefox.py +++ b/src/crawlee/project_template/templates/main_playwright_firefox.py @@ -10,5 +10,6 @@ headless=True, max_requests_per_crawl=10, browser_type="firefox", - {{ self.http_client_instantiation() }}) + {{ self.http_client_instantiation() }} +) # % endblock diff --git a/src/crawlee/project_template/templates/main_playwright_webkit.py b/src/crawlee/project_template/templates/main_playwright_webkit.py index d5d2cd0a1a..ace63d2213 100644 --- a/src/crawlee/project_template/templates/main_playwright_webkit.py +++ b/src/crawlee/project_template/templates/main_playwright_webkit.py @@ -10,5 +10,6 @@ headless=True, max_requests_per_crawl=10, browser_type="webkit", - {{ self.http_client_instantiation() }}) + 
{{ self.http_client_instantiation() }} +) # % endblock diff --git a/src/crawlee/project_template/templates/routes_camoufox.py b/src/crawlee/project_template/templates/routes_camoufox.py index 9ab35a2a80..c0f0c62336 100644 --- a/src/crawlee/project_template/templates/routes_camoufox.py +++ b/src/crawlee/project_template/templates/routes_camoufox.py @@ -1,4 +1,4 @@ -from crawlee.crawlers import PlaywrightCrawlingContext +rom crawlee.crawlers import PlaywrightCrawlingContext from crawlee.router import Router router = Router[PlaywrightCrawlingContext]() diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index a2b0e13cd8..81945a760a 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -15,6 +15,9 @@ def pytest_configure(config: Config) -> None: 'impit', 'playwright', 'playwright_camoufox', + 'playwright_chrome', + 'playwright_firefox', + 'playwright_webkit', 'parsel', 'beautifulsoup', 'uv', From 5dd7346de77dd0c50f0561b1e4d7ff1790246e6d Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 19 Feb 2026 17:42:07 +0100 Subject: [PATCH 5/5] Delete the redundant file finally --- .../templates/routes_camoufox.py | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 src/crawlee/project_template/templates/routes_camoufox.py diff --git a/src/crawlee/project_template/templates/routes_camoufox.py b/src/crawlee/project_template/templates/routes_camoufox.py deleted file mode 100644 index c0f0c62336..0000000000 --- a/src/crawlee/project_template/templates/routes_camoufox.py +++ /dev/null @@ -1,19 +0,0 @@ -rom crawlee.crawlers import PlaywrightCrawlingContext -from crawlee.router import Router - -router = Router[PlaywrightCrawlingContext]() - - -@router.default_handler -async def default_handler(context: PlaywrightCrawlingContext) -> None: - """Default request handler.""" - context.log.info(f'Processing {context.request.url} ...') - title = await context.page.query_selector('title') - await context.push_data( - { - 'url': 
context.request.loaded_url, - 'title': await title.inner_text() if title else None, - } - ) - - await context.enqueue_links()