diff --git a/.github/workflows/on_schedule_tests.yaml b/.github/workflows/on_schedule_tests.yaml index fafe09fcca..116a9c8a5e 100644 --- a/.github/workflows/on_schedule_tests.yaml +++ b/.github/workflows/on_schedule_tests.yaml @@ -24,8 +24,8 @@ jobs: fail-fast: false max-parallel: 12 matrix: - crawler-type: ["playwright_camoufox", "playwright", "parsel", "beautifulsoup"] - http-client: [ "httpx", "curl_impersonate"] + crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup"] + http-client: ["httpx", "curl_impersonate"] package-manager: ["pip", "uv", "poetry"] runs-on: "ubuntu-latest" diff --git a/src/crawlee/browsers/_playwright_browser_plugin.py b/src/crawlee/browsers/_playwright_browser_plugin.py index fe9eb09e6e..755aec68e1 100644 --- a/src/crawlee/browsers/_playwright_browser_plugin.py +++ b/src/crawlee/browsers/_playwright_browser_plugin.py @@ -82,11 +82,7 @@ def __init__( 'executable_path': config.default_browser_path, 'chromium_sandbox': not config.disable_browser_sandbox, } - - if browser_type == 'chrome' and default_launch_browser_options['executable_path']: - raise ValueError( - 'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.' - ) + explicit_browser_launch_options = browser_launch_options or {} # Map 'chrome' to 'chromium' with the 'chrome' channel. if browser_type == 'chrome': @@ -94,8 +90,15 @@ def __init__( # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome. default_launch_browser_options['channel'] = 'chrome' + if executable_path := explicit_browser_launch_options.get( + 'executable_path', default_launch_browser_options.get('executable_path') + ): + logger.debug( + f"Using browser executable from {executable_path}, which takes precedence over 'chrome' channel." + ) + self._browser_type: BrowserType = browser_type - self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {}) + self._browser_launch_options: dict[str, Any] = default_launch_browser_options | explicit_browser_launch_options self._browser_new_context_options = browser_new_context_options or {} self._max_open_pages_per_browser = max_open_pages_per_browser self._use_incognito_pages = use_incognito_pages diff --git a/src/crawlee/project_template/cookiecutter.json b/src/crawlee/project_template/cookiecutter.json index 9026851051..53e0c8f445 100644 --- a/src/crawlee/project_template/cookiecutter.json +++ b/src/crawlee/project_template/cookiecutter.json @@ -1,7 +1,7 @@ { "project_name": "crawlee-python-project", "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}", - "crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox"], + "crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit"], "__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}", "http_client": ["impit", "httpx", "curl-impersonate"], "package_manager": ["poetry", "pip", "uv"], diff --git a/src/crawlee/project_template/templates/main_playwright_chrome.py b/src/crawlee/project_template/templates/main_playwright_chrome.py new file mode 100644 index 0000000000..e40ffb7426 --- /dev/null +++ b/src/crawlee/project_template/templates/main_playwright_chrome.py @@ -0,0 +1,15 @@ +# % extends 'main.py' + +# % block import +from crawlee.crawlers import PlaywrightCrawler +# % endblock + +# % block instantiation +crawler = PlaywrightCrawler( + request_handler=router, + headless=True, + max_requests_per_crawl=10, + browser_type="chrome", + {{ self.http_client_instantiation() }} +) +# % endblock diff --git a/src/crawlee/project_template/templates/main_playwright_firefox.py b/src/crawlee/project_template/templates/main_playwright_firefox.py new file mode 100644 index 0000000000..d18f45f1a3 --- /dev/null +++ b/src/crawlee/project_template/templates/main_playwright_firefox.py @@ -0,0 +1,15 @@ +# % extends 'main.py' + +# % block import +from crawlee.crawlers import PlaywrightCrawler +# % endblock + +# % block instantiation +crawler = PlaywrightCrawler( + request_handler=router, + headless=True, + max_requests_per_crawl=10, + browser_type="firefox", + {{ self.http_client_instantiation() }} +) +# % endblock diff --git a/src/crawlee/project_template/templates/main_playwright_webkit.py b/src/crawlee/project_template/templates/main_playwright_webkit.py new file mode 100644 index 0000000000..ace63d2213 --- /dev/null +++ b/src/crawlee/project_template/templates/main_playwright_webkit.py @@ -0,0 +1,15 @@ +# % extends 'main.py' + +# % block import +from crawlee.crawlers import PlaywrightCrawler +# % endblock + +# % block instantiation +crawler = PlaywrightCrawler( + request_handler=router, + headless=True, + max_requests_per_crawl=10, + browser_type="webkit", + {{ self.http_client_instantiation() }} +) +# % endblock diff --git a/src/crawlee/project_template/templates/routes_camoufox.py b/src/crawlee/project_template/templates/routes_camoufox.py deleted file mode 100644 index 9ab35a2a80..0000000000 --- a/src/crawlee/project_template/templates/routes_camoufox.py +++ /dev/null @@ -1,19 +0,0 @@ -from crawlee.crawlers import PlaywrightCrawlingContext -from crawlee.router import Router - -router = Router[PlaywrightCrawlingContext]() - - -@router.default_handler -async def default_handler(context: PlaywrightCrawlingContext) -> None: - """Default request handler.""" - context.log.info(f'Processing {context.request.url} ...') - title = await context.page.query_selector('title') - await context.push_data( - { - 'url': context.request.loaded_url, - 'title': await title.inner_text() if title else None, - } - ) - - await context.enqueue_links() diff --git a/src/crawlee/project_template/templates/routes_playwright_camoufox.py b/src/crawlee/project_template/templates/routes_playwright_camoufox.py deleted file mode 100644 index 9ab35a2a80..0000000000 --- a/src/crawlee/project_template/templates/routes_playwright_camoufox.py +++ /dev/null @@ -1,19 +0,0 @@ -from crawlee.crawlers import PlaywrightCrawlingContext -from crawlee.router import Router - -router = Router[PlaywrightCrawlingContext]() - - -@router.default_handler -async def default_handler(context: PlaywrightCrawlingContext) -> None: - """Default request handler.""" - context.log.info(f'Processing {context.request.url} ...') - title = await context.page.query_selector('title') - await context.push_data( - { - 'url': context.request.loaded_url, - 'title': await title.inner_text() if title else None, - } - ) - - await context.enqueue_links() diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile index dda254f8f0..323181d058 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile @@ -4,8 +4,13 @@ # % if cookiecutter.crawler_type == 'playwright' FROM apify/actor-python-playwright:3.13 # % elif cookiecutter.crawler_type == 'playwright-camoufox' -# Currently camoufox has issues installing on Python 3.13 -FROM apify/actor-python-playwright:3.12 +FROM apify/actor-python-playwright-camoufox:3.13 +# % elif cookiecutter.crawler_type == 'playwright-chrome' +FROM apify/actor-python-playwright-chrome:3.13 +# % elif cookiecutter.crawler_type == 'playwright-firefox' +FROM apify/actor-python-playwright-firefox:3.13 +# % elif cookiecutter.crawler_type == 'playwright-webkit' +FROM apify/actor-python-playwright-webkit:3.13 # % else FROM apify/actor-python:3.13 # % endif diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml index fd6d28c7c2..2de6aa2532 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml @@ -1,4 +1,4 @@ -# % if cookiecutter.crawler_type == 'playwright-camoufox' +# % if cookiecutter.crawler_type.startswith('playwright') # % set extras = ['playwright'] # % else # % set extras = [cookiecutter.crawler_type] diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt index ef69bdc1e4..8ac28ed5e4 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt @@ -1,5 +1,7 @@ # % if cookiecutter.crawler_type == 'playwright-camoufox' camoufox[geoip]~=0.4.5 +# % endif +# % if cookiecutter.crawler_type.startswith('playwright') # % set extras = ['playwright'] # % else # % set extras = [cookiecutter.crawler_type] diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py index ab665288b7..7ee8583b82 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py @@ -1 +1,5 @@ +# % if cookiecutter.crawler_type.startswith('playwright') +# % include 'routes_playwright.py' +# % else # % include 'routes_%s.py' % cookiecutter.__crawler_type +# % endif diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index a2b0e13cd8..81945a760a 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -15,6 +15,9 @@ def pytest_configure(config: Config) -> None: 'impit', 'playwright', 'playwright_camoufox', + 'playwright_chrome', + 'playwright_firefox', + 'playwright_webkit', 'parsel', 'beautifulsoup', 'uv', diff --git a/tests/e2e/project_template/test_static_crawlers_templates.py b/tests/e2e/project_template/test_static_crawlers_templates.py index cb951587e0..34a30b61ae 100644 --- a/tests/e2e/project_template/test_static_crawlers_templates.py +++ b/tests/e2e/project_template/test_static_crawlers_templates.py @@ -20,6 +20,9 @@ 'crawler_type', [ pytest.param('playwright-camoufox', marks=pytest.mark.playwright_camoufox), + pytest.param('playwright-chrome', marks=pytest.mark.playwright_chrome), + pytest.param('playwright-firefox', marks=pytest.mark.playwright_firefox), + pytest.param('playwright-webkit', marks=pytest.mark.playwright_webkit), pytest.param('playwright', marks=pytest.mark.playwright), pytest.param('parsel', marks=pytest.mark.parsel), pytest.param('beautifulsoup', marks=pytest.mark.beautifulsoup),