From 943f5b7654f7b35c90248794398667a302444622 Mon Sep 17 00:00:00 2001 From: "user.mail" Date: Mon, 9 Feb 2026 16:53:34 +0300 Subject: [PATCH 1/2] Add Datasets API with 8 datasets and demo notebooks - Datasets: LinkedIn (profiles, companies), Amazon, Crunchbase, IMDB, NBA, Goodreads, World Population - Export utilities: export_json, export_csv, export_jsonl - Notebooks: linkedin, amazon, crunchbase demos --- CHANGELOG.md | 34 + LICENSE | 1 - notebooks/datasets/amazon/amazon.ipynb | 490 ++++++++++++ .../datasets/crunchbase/crunchbase.ipynb | 588 ++++++++++++++ notebooks/datasets/linkedin/linkedin.ipynb | 754 ++++++++++++++++++ pyproject.toml | 2 +- requirements.txt | 1 - src/brightdata/cli/README.md | 1 - src/brightdata/cli/banner.py | 12 +- src/brightdata/client.py | 31 + src/brightdata/datasets/__init__.py | 52 ++ src/brightdata/datasets/amazon/__init__.py | 5 + src/brightdata/datasets/amazon/products.py | 412 ++++++++++ src/brightdata/datasets/base.py | 221 +++++ src/brightdata/datasets/client.py | 136 ++++ .../datasets/crunchbase/__init__.py | 5 + .../datasets/crunchbase/companies.py | 602 ++++++++++++++ src/brightdata/datasets/goodreads/__init__.py | 5 + src/brightdata/datasets/goodreads/books.py | 121 +++ src/brightdata/datasets/imdb/__init__.py | 5 + src/brightdata/datasets/imdb/movies.py | 195 +++++ src/brightdata/datasets/linkedin/__init__.py | 6 + .../datasets/linkedin/company_profiles.py | 197 +++++ .../datasets/linkedin/people_profiles.py | 285 +++++++ src/brightdata/datasets/models.py | 73 ++ src/brightdata/datasets/nba/__init__.py | 5 + src/brightdata/datasets/nba/players_stats.py | 136 ++++ src/brightdata/datasets/utils.py | 139 ++++ .../datasets/world_population/__init__.py | 5 + .../datasets/world_population/countries.py | 155 ++++ src/brightdata/utils/ssl_helpers.py | 4 +- 31 files changed, 4666 insertions(+), 12 deletions(-) create mode 100644 notebooks/datasets/amazon/amazon.ipynb create mode 100644 notebooks/datasets/crunchbase/crunchbase.ipynb create mode 100644 notebooks/datasets/linkedin/linkedin.ipynb create mode 100644 src/brightdata/datasets/__init__.py create mode 100644 src/brightdata/datasets/amazon/__init__.py create mode 100644 src/brightdata/datasets/amazon/products.py create mode 100644 src/brightdata/datasets/base.py create mode 100644 src/brightdata/datasets/client.py create mode 100644 src/brightdata/datasets/crunchbase/__init__.py create mode 100644 src/brightdata/datasets/crunchbase/companies.py create mode 100644 src/brightdata/datasets/goodreads/__init__.py create mode 100644 src/brightdata/datasets/goodreads/books.py create mode 100644 src/brightdata/datasets/imdb/__init__.py create mode 100644 src/brightdata/datasets/imdb/movies.py create mode 100644 src/brightdata/datasets/linkedin/__init__.py create mode 100644 src/brightdata/datasets/linkedin/company_profiles.py create mode 100644 src/brightdata/datasets/linkedin/people_profiles.py create mode 100644 src/brightdata/datasets/models.py create mode 100644 src/brightdata/datasets/nba/__init__.py create mode 100644 src/brightdata/datasets/nba/players_stats.py create mode 100644 src/brightdata/datasets/utils.py create mode 100644 src/brightdata/datasets/world_population/__init__.py create mode 100644 src/brightdata/datasets/world_population/countries.py diff --git a/CHANGELOG.md b/CHANGELOG.md index fc1b06f..ee5c5dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,39 @@ # Bright Data Python SDK Changelog +## Version 2.2.0 - Datasets API + +### ✨ New Features + +#### Datasets API +Access Bright 
Data's pre-collected datasets with filtering and export capabilities. + +```python +async with BrightDataClient() as client: + # Filter dataset records + snapshot_id = await client.datasets.amazon_products.filter( + filter={"name": "rating", "operator": ">=", "value": 4.5}, + records_limit=100 + ) + # Download results + data = await client.datasets.amazon_products.download(snapshot_id) +``` + +**8 Datasets:** LinkedIn Profiles, LinkedIn Companies, Amazon Products, Crunchbase Companies, IMDB Movies, NBA Players Stats, Goodreads Books, World Population + +**Export Utilities:** +```python +from brightdata.datasets import export_json, export_csv +export_json(data, "results.json") +export_csv(data, "results.csv") +``` + +### 📓 Notebooks +- `notebooks/datasets/linkedin/linkedin.ipynb` - LinkedIn datasets (profiles & companies) +- `notebooks/datasets/amazon/amazon.ipynb` - Amazon products dataset +- `notebooks/datasets/crunchbase/crunchbase.ipynb` - Crunchbase companies dataset + +--- + ## Version 2.1.2 - Web Scrapers & Notebooks ### 🐛 Bug Fixes diff --git a/LICENSE b/LICENSE index 3743c5b..f67927a 100644 --- a/LICENSE +++ b/LICENSE @@ -19,4 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - diff --git a/notebooks/datasets/amazon/amazon.ipynb b/notebooks/datasets/amazon/amazon.ipynb new file mode 100644 index 0000000..9ca9e47 --- /dev/null +++ b/notebooks/datasets/amazon/amazon.ipynb @@ -0,0 +1,490 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 🛒 Amazon Products Dataset API\n", + "\n", + "Access Bright Data's pre-collected Amazon Products dataset:\n", + "- **85 fields** including pricing, ratings, reviews, categories, and more\n", + "- Filter by price, rating, brand, category, availability, and other criteria\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API Token: 7011787d-2...3336\n", + "Setup complete!\n" + ] + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "\n", + "API_TOKEN = os.getenv(\"BRIGHTDATA_API_TOKEN\")\n", + "if not API_TOKEN:\n", + " raise ValueError(\"Set BRIGHTDATA_API_TOKEN in .env file\")\n", + "\n", + "print(f\"API Token: {API_TOKEN[:10]}...{API_TOKEN[-4:]}\")\n", + "print(\"Setup complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Client initialized\n" + ] + } + ], + "source": [ + "from brightdata import BrightDataClient\n", + "\n", + "client = BrightDataClient(token=API_TOKEN)\n", + "\n", + "print(\"Client initialized\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 1: Explore Amazon Products Fields\n", + "\n", + "Before filtering, explore available fields using the class metadata." 
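+    ,
+    "\n",
+    "\n",
+    "A quick sketch of inspecting the static metadata without an API call (assuming `AmazonProducts` exposes the same `get_field_names()` helper documented for the other dataset classes; `FIELDS` is used in the next cell):\n",
+    "\n",
+    "```python\n",
+    "from brightdata.datasets import AmazonProducts\n",
+    "\n",
+    "# Static field metadata shipped with the SDK (no API call needed)\n",
+    "print(AmazonProducts.get_field_names()[:5])  # assumed helper, see note above\n",
+    "print(AmazonProducts.FIELDS[\"rating\"])       # per-field type/description dict\n",
+    "```"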
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Amazon Products Dataset ===\n", + "Dataset ID: gd_l7q7dkf244hwjntr0\n", + "Total fields: 85\n", + "\n", + "Field types:\n", + " Text fields: 38\n", + " Number fields: 14\n", + " Array fields: 15\n", + " Boolean fields: 7\n", + " URL fields: 7\n" + ] + } + ], + "source": [ + "from brightdata.datasets import AmazonProducts\n", + "\n", + "print(\"=== Amazon Products Dataset ===\")\n", + "print(f\"Dataset ID: {AmazonProducts.DATASET_ID}\")\n", + "print(f\"Total fields: {len(AmazonProducts.FIELDS)}\")\n", + "\n", + "# Show field types breakdown\n", + "print(f\"\\nField types:\")\n", + "print(f\" Text fields: {len(AmazonProducts.get_fields_by_type('text'))}\")\n", + "print(f\" Number fields: {len(AmazonProducts.get_fields_by_type('number'))}\")\n", + "print(f\" Array fields: {len(AmazonProducts.get_fields_by_type('array'))}\")\n", + "print(f\" Boolean fields: {len(AmazonProducts.get_fields_by_type('boolean'))}\")\n", + "print(f\" URL fields: {len(AmazonProducts.get_fields_by_type('url'))}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 2: Get Dataset Metadata from API" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fetching Amazon Products metadata from API...\n", + "\n", + "Dataset ID: gd_l7q7dkf244hwjntr0\n", + "Total fields from API: 86\n", + "\n", + "=== Sample Fields ===\n", + " title: text - Product title\n", + " seller_name: text - Seller name\n", + " brand: text - Product brand\n", + " description: text - A brief description of the product\n", + " initial_price: price - Initial price\n", + " currency: text - Currency of the product\n", + " availability: text - Product availability\n", + " reviews_count: number - Number of reviews\n", + " categories: array - Product categories\n", + " parent_asin: text - Parent ASIN of the product\n" + ] + } + ], + "source": [ + "print(\"Fetching Amazon Products metadata from API...\\n\")\n", + "\n", + "async with client:\n", + " metadata = await client.datasets.amazon_products.get_metadata()\n", + "\n", + "print(f\"Dataset ID: {metadata.id}\")\n", + "print(f\"Total fields from API: {len(metadata.fields)}\")\n", + "\n", + "print(\"\\n=== Sample Fields ===\")\n", + "for i, (name, field) in enumerate(list(metadata.fields.items())[:10]):\n", + " print(f\" {name}: {field.type} - {field.description or 'N/A'}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 3: Keyword Search with Rating Filter\n", + "\n", + "Search for products by keyword and filter by rating." 
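+    ,
+    "\n",
+    "\n",
+    "The filter grammar used throughout this notebook: a single condition is a flat dict, and multiple conditions are wrapped with an `and`/`or` operator (sketch, `{...}` stands for nested conditions):\n",
+    "\n",
+    "```python\n",
+    "# Single condition\n",
+    "{\"name\": \"rating\", \"operator\": \">=\", \"value\": 4.5}\n",
+    "\n",
+    "# Combined conditions, as in the next cell\n",
+    "{\"operator\": \"and\", \"filters\": [{...}, {...}]}\n",
+    "```"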
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filter: Keyboards with rating >= 4.5\n", + "Records limit: 2\n", + "\n", + "Snapshot created: snap_mley0j875vz72i0rb\n", + "\n", + "Run the next cell to download the data...\n" + ] + } + ], + "source": [ + "# Step 1: Create filter and get snapshot_id\n", + "# Search for keyboards with 4.5+ star rating\n", + "FILTER = {\n", + " \"operator\": \"and\",\n", + " \"filters\": [\n", + " {\"name\": \"title\", \"operator\": \"includes\", \"value\": \"keyboard\"},\n", + " {\"name\": \"rating\", \"operator\": \">=\", \"value\": 4.5}\n", + " ]\n", + "}\n", + "LIMIT = 2\n", + "\n", + "print(\"Filter: Keyboards with rating >= 4.5\")\n", + "print(f\"Records limit: {LIMIT}\\n\")\n", + "\n", + "async with client:\n", + " snapshot_id = await client.datasets.amazon_products.filter(\n", + " filter=FILTER,\n", + " records_limit=LIMIT\n", + " )\n", + "\n", + " \n", + "\n", + "print(f\"Snapshot created: {snapshot_id}\")\n", + "print(\"\\nRun the next cell to download the data...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading snapshot: snap_mley0j875vz72i0rb\n", + "(This will poll until ready...)\n", + "\n" + ] + }, + { + "ename": "TimeoutError", + "evalue": "Snapshot snap_mley0j875vz72i0rb not ready after 300s (status: scheduled)", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mTimeoutError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m(This will poll until ready...)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 5\u001b[39m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mwith\u001b[39;00m client:\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m data = \u001b[38;5;28;01mawait\u001b[39;00m client.datasets.amazon_products.download(\n\u001b[32m 7\u001b[39m snapshot_id,\n\u001b[32m 8\u001b[39m timeout=\u001b[32m300\u001b[39m,\n\u001b[32m 9\u001b[39m poll_interval=\u001b[32m5\u001b[39m\n\u001b[32m 10\u001b[39m )\n\u001b[32m 12\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mDownloaded \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(data)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m products:\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 13\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m product \u001b[38;5;129;01min\u001b[39;00m data[:\u001b[32m5\u001b[39m]:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/projects/sdk-python/src/brightdata/datasets/base.py:156\u001b[39m, in \u001b[36mBaseDataset.download\u001b[39m\u001b[34m(self, snapshot_id, format, timeout, poll_interval)\u001b[39m\n\u001b[32m 154\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m DatasetError(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSnapshot failed: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstatus.error\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 155\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m time.time() - start_time > timeout:\n\u001b[32m--> \u001b[39m\u001b[32m156\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mTimeoutError\u001b[39;00m(\n\u001b[32m 157\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSnapshot \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msnapshot_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not ready after \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtimeout\u001b[38;5;132;01m}\u001b[39;00m\u001b[33ms \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 158\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m(status: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstatus.status\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m)\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 159\u001b[39m )\n\u001b[32m 161\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m asyncio.sleep(poll_interval)\n\u001b[32m 163\u001b[39m \u001b[38;5;66;03m# Download data\u001b[39;00m\n", + "\u001b[31mTimeoutError\u001b[39m: Snapshot snap_mley0j875vz72i0rb not ready after 300s (status: scheduled)" + ] + } + ], + "source": [ + "# Step 2: Download data (polls until ready)\n", + "print(f\"Downloading snapshot: {snapshot_id}\")\n", + "print(\"(This will poll until ready...)\\n\")\n", + "\n", + "async with client:\n", + " data = await client.datasets.amazon_products.download(\n", + " snapshot_id,\n", + " timeout=300,\n", + " poll_interval=5\n", + " )\n", + "\n", + "print(f\"Downloaded {len(data)} products:\")\n", + "for product in data[:5]:\n", + " print(f\"\\n Title: {product.get('title', 'N/A')[:60]}...\")\n", + " print(f\" Rating: {product.get('rating', 'N/A')} ({product.get('reviews_count', 0)} reviews)\")\n", + " print(f\" Price: {product.get('currency', '')} {product.get('final_price', 'N/A')}\")\n", + " print(f\" Brand: {product.get('brand', 'N/A')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 4: Filter by Price Range\n", + "\n", + "Find products in a specific price range." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 1: Create filter\n", + "PRICE_FILTER = {\n", + " \"operator\": \"and\",\n", + " \"filters\": [\n", + " {\"name\": \"final_price\", \"operator\": \">=\", \"value\": 50},\n", + " {\"name\": \"final_price\", \"operator\": \"<=\", \"value\": 100}\n", + " ]\n", + "}\n", + "\n", + "print(\"Filter: Products priced $50-$100\")\n", + "print(f\"Records limit: 5\\n\")\n", + "\n", + "async with client:\n", + " snapshot_id = await client.datasets.amazon_products.filter(\n", + " filter=PRICE_FILTER,\n", + " records_limit=5\n", + " )\n", + "\n", + "print(f\"Snapshot created: {snapshot_id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 2: Download\n", + "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n", + "\n", + "async with client:\n", + " data = await client.datasets.amazon_products.download(snapshot_id)\n", + "\n", + "print(f\"Downloaded {len(data)} products:\")\n", + "for product in data:\n", + " print(f\" - {product.get('title', 'N/A')[:50]}... - ${product.get('final_price', 'N/A')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 5: Filter by Availability and Prime\n", + "\n", + "Find available Prime-eligible products." 
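+    ,
+    "\n",
+    "\n",
+    "Boolean fields are filtered with `=` and `True`/`False`. An `or` variant (sketch) would match products that are either available or Prime-eligible:\n",
+    "\n",
+    "```python\n",
+    "{\n",
+    "    \"operator\": \"or\",\n",
+    "    \"filters\": [\n",
+    "        {\"name\": \"is_available\", \"operator\": \"=\", \"value\": True},\n",
+    "        {\"name\": \"amazon_prime\", \"operator\": \"=\", \"value\": True}\n",
+    "    ]\n",
+    "}\n",
+    "```"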
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1: Create filter\n",
+    "PRIME_FILTER = {\n",
+    "    \"operator\": \"and\",\n",
+    "    \"filters\": [\n",
+    "        {\"name\": \"is_available\", \"operator\": \"=\", \"value\": True},\n",
+    "        {\"name\": \"amazon_prime\", \"operator\": \"=\", \"value\": True}\n",
+    "    ]\n",
+    "}\n",
+    "\n",
+    "print(\"Filter: Available + Prime eligible\")\n",
+    "print(f\"Records limit: 5\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    snapshot_id = await client.datasets.amazon_products.filter(\n",
+    "        filter=PRIME_FILTER,\n",
+    "        records_limit=5\n",
+    "    )\n",
+    "\n",
+    "print(f\"Snapshot created: {snapshot_id}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 2: Download\n",
+    "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    data = await client.datasets.amazon_products.download(snapshot_id)\n",
+    "\n",
+    "print(f\"Downloaded {len(data)} products:\")\n",
+    "for product in data:\n",
+    "    print(f\"\\n  Title: {product.get('title', 'N/A')[:50]}...\")\n",
+    "    print(f\"  Available: {product.get('is_available', 'N/A')}\")\n",
+    "    print(f\"  Prime: {product.get('amazon_prime', 'N/A')}\")\n",
+    "    print(f\"  Price: ${product.get('final_price', 'N/A')}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Test 6: Filter by Brand"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1: Create filter\n",
+    "BRAND = \"Apple\"\n",
+    "\n",
+    "BRAND_FILTER = {\n",
+    "    \"name\": \"brand\",\n",
+    "    \"operator\": \"=\",\n",
+    "    \"value\": BRAND\n",
+    "}\n",
+    "\n",
+    "print(f\"Filter: Brand = {BRAND}\")\n",
+    "print(f\"Records limit: 5\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    snapshot_id = await client.datasets.amazon_products.filter(\n",
+    "        filter=BRAND_FILTER,\n",
+    "        records_limit=5\n",
+    "    )\n",
+    "\n",
+    "print(f\"Snapshot created: {snapshot_id}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 2: Download\n",
+    "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    data = await client.datasets.amazon_products.download(snapshot_id)\n",
+    "\n",
+    "print(f\"Downloaded {len(data)} products:\")\n",
+    "for product in data:\n",
+    "    print(f\" - {product.get('title', 'N/A')[:60]}...\")\n",
+    "    print(f\"   Brand: {product.get('brand', 'N/A')}, Price: ${product.get('final_price', 'N/A')}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Test 7: Export Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from pathlib import Path\n",
+    "\n",
+    "if data:\n",
+    "    output_file = Path.cwd() / \"amazon_dataset_results.json\"\n",
+    "    \n",
+    "    with open(output_file, \"w\") as f:\n",
+    "        json.dump(data, f, indent=2, default=str)\n",
+    "    \n",
+    "    print(f\"Exported to: {output_file}\")\n",
+    "    print(f\"Records: {len(data)}\")\n",
+    "else:\n",
+    "    print(\"No data to export\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "from brightdata.datasets import export_json, export_csv, export\n\n# Export to JSON\njson_file = export_json(data, \"amazon_results.json\")\nprint(f\"Exported to: {json_file}\")\n\n# Export to CSV\ncsv_file = export_csv(data, 
\"amazon_results.csv\")\nprint(f\"Exported to: {csv_file}\")\n\n# Or use auto-detect based on extension\n# export(data, \"results.json\")\n# export(data, \"results.csv\")\n\nprint(f\"\\nRecords: {len(data)}\")" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/datasets/crunchbase/crunchbase.ipynb b/notebooks/datasets/crunchbase/crunchbase.ipynb new file mode 100644 index 0000000..0babefd --- /dev/null +++ b/notebooks/datasets/crunchbase/crunchbase.ipynb @@ -0,0 +1,588 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 🏢 Crunchbase Companies Dataset API\n", + "\n", + "Access Bright Data's pre-collected Crunchbase Companies dataset:\n", + "- **2.3M+ companies** with **98 fields**\n", + "- Filter by funding, employees, industry, location, and more\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API Token: 7011787d-2...3336\n", + "Setup complete!\n" + ] + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "\n", + "API_TOKEN = os.getenv(\"BRIGHTDATA_API_TOKEN\")\n", + "if not API_TOKEN:\n", + " raise ValueError(\"Set BRIGHTDATA_API_TOKEN in .env file\")\n", + "\n", + "print(f\"API Token: {API_TOKEN[:10]}...{API_TOKEN[-4:]}\")\n", + "print(\"Setup complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Client initialized\n" + ] + } + ], + "source": [ + "from brightdata import BrightDataClient\n", + "\n", + "client = BrightDataClient(token=API_TOKEN)\n", + "\n", + "print(\"Client initialized\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 1: Explore Crunchbase Fields\n", + "\n", + "Explore the 98 available fields." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from brightdata.datasets import CrunchbaseCompanies\n", + "\n", + "print(\"=== Crunchbase Companies Dataset ===\")\n", + "print(f\"Dataset ID: {CrunchbaseCompanies.DATASET_ID}\")\n", + "print(f\"Total fields: {len(CrunchbaseCompanies.FIELDS)}\")\n", + "\n", + "# High fill rate fields (most reliable for filtering)\n", + "high_fill = CrunchbaseCompanies.get_high_fill_rate_fields(min_rate=90.0)\n", + "print(f\"\\nHigh fill rate fields (>90%): {len(high_fill)}\")\n", + "for field in high_fill[:10]:\n", + " info = CrunchbaseCompanies.FIELDS[field]\n", + " print(f\" - {field}: {info['type']} ({info['fill_rate']}%)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show fields by type\n", + "print(\"\\n=== Fields by Type ===\")\n", + "print(f\"Text fields: {len(CrunchbaseCompanies.get_fields_by_type('text'))}\")\n", + "print(f\"Number fields: {len(CrunchbaseCompanies.get_fields_by_type('number'))}\")\n", + "print(f\"Array fields: {len(CrunchbaseCompanies.get_fields_by_type('array'))}\")\n", + "print(f\"Object fields: {len(CrunchbaseCompanies.get_fields_by_type('object'))}\")\n", + "print(f\"URL fields: {len(CrunchbaseCompanies.get_fields_by_type('url'))}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show number fields (useful for filtering)\n", + "print(\"\\n=== Number Fields (for numeric filtering) ===\")\n", + "for field in CrunchbaseCompanies.get_fields_by_type('number')[:15]:\n", + " info = CrunchbaseCompanies.FIELDS[field]\n", + " print(f\" - {field}: {info['description'][:50]}... ({info['fill_rate']}%)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 2: Get Dataset Metadata from API" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Fetching Crunchbase metadata from API...\\n\")\n", + "\n", + "async with client:\n", + " metadata = await client.datasets.crunchbase_companies.get_metadata()\n", + "\n", + "print(f\"Dataset ID: {metadata.id}\")\n", + "print(f\"Total fields from API: {len(metadata.fields)}\")\n", + "\n", + "print(\"\\n=== Sample Fields ===\")\n", + "for i, (name, field) in enumerate(list(metadata.fields.items())[:10]):\n", + " print(f\" {name}: {field.type} - {field.description or 'N/A'}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 3: Filter by Operating Status\n", + "\n", + "Find active companies." 
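+    ,
+    "\n",
+    "\n",
+    "To match several statuses at once, the `in` operator takes a list of values (sketch):\n",
+    "\n",
+    "```python\n",
+    "{\"name\": \"operating_status\", \"operator\": \"in\", \"value\": [\"active\", \"closed\"]}\n",
+    "```"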
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1: Create filter and get snapshot_id\n",
+    "FILTER = {\n",
+    "    \"name\": \"operating_status\",\n",
+    "    \"operator\": \"=\",\n",
+    "    \"value\": \"active\"\n",
+    "}\n",
+    "LIMIT = 10\n",
+    "\n",
+    "print(f\"Filter: {FILTER}\")\n",
+    "print(f\"Records limit: {LIMIT}\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    snapshot_id = await client.datasets.crunchbase_companies.filter(\n",
+    "        filter=FILTER,\n",
+    "        records_limit=LIMIT\n",
+    "    )\n",
+    "\n",
+    "print(f\"Snapshot created: {snapshot_id}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 2: Download data (polls until ready)\n",
+    "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    data = await client.datasets.crunchbase_companies.download(snapshot_id)\n",
+    "\n",
+    "print(f\"Downloaded {len(data)} companies:\")\n",
+    "for company in data[:5]:\n",
+    "    print(f\"\\n  Name: {company.get('name', 'N/A')}\")\n",
+    "    print(f\"  Status: {company.get('operating_status', 'N/A')}\")\n",
+    "    print(f\"  Industries: {company.get('industries', 'N/A')}\")\n",
+    "    print(f\"  Employees: {company.get('num_employees', 'N/A')}\")\n",
+    "    print(f\"  Website: {company.get('website', 'N/A')}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Test 4: Filter by Employee Profiles\n",
+    "\n",
+    "Find companies with more than 100 LinkedIn employee profiles."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1: Create filter\n",
+    "EMPLOYEE_FILTER = {\n",
+    "    \"name\": \"num_employee_profiles\",\n",
+    "    \"operator\": \">\",\n",
+    "    \"value\": 100\n",
+    "}\n",
+    "\n",
+    "print(\"Filter: Companies with >100 employee profiles\")\n",
+    "print(f\"Records limit: 5\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    snapshot_id = await client.datasets.crunchbase_companies.filter(\n",
+    "        filter=EMPLOYEE_FILTER,\n",
+    "        records_limit=5\n",
+    "    )\n",
+    "\n",
+    "print(f\"Snapshot created: {snapshot_id}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 2: Download data\n",
+    "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    data = await client.datasets.crunchbase_companies.download(snapshot_id)\n",
+    "\n",
+    "print(f\"Downloaded {len(data)} companies:\")\n",
+    "for company in data:\n",
+    "    print(f\" - {company.get('name', 'N/A')}\")\n",
+    "    print(f\"   Employee profiles: {company.get('num_employee_profiles', 'N/A')}\")\n",
+    "    print(f\"   Employees: {company.get('num_employees', 'N/A')}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Test 5: Filter by Country\n",
+    "\n",
+    "Find US-based companies."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1: Create filter\n",
+    "COUNTRY_FILTER = {\n",
+    "    \"name\": \"country_code\",\n",
+    "    \"operator\": \"=\",\n",
+    "    \"value\": \"USA\"\n",
+    "}\n",
+    "\n",
+    "print(\"Filter: US-based companies\")\n",
+    "print(f\"Records limit: 5\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    snapshot_id = await client.datasets.crunchbase_companies.filter(\n",
+    "        filter=COUNTRY_FILTER,\n",
+    "        records_limit=5\n",
+    "    )\n",
+    "\n",
+    "print(f\"Snapshot created: {snapshot_id}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 2: Download data\n",
+    "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    data = await client.datasets.crunchbase_companies.download(snapshot_id)\n",
+    "\n",
+    "print(f\"Downloaded {len(data)} companies:\")\n",
+    "for company in data:\n",
+ " print(f\" - {company.get('name', 'N/A')} ({company.get('country_code', 'N/A')})\")\n", + " print(f\" HQ: {company.get('address', 'N/A')[:50]}...\" if company.get('address') and len(company.get('address', '')) > 50 else f\" HQ: {company.get('address', 'N/A')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 1: Create filter\n", + "FUNDED_FILTER = {\n", + " \"operator\": \"and\",\n", + " \"filters\": [\n", + " {\"name\": \"operating_status\", \"operator\": \"=\", \"value\": \"active\"},\n", + " {\"name\": \"num_investors\", \"operator\": \">\", \"value\": 0}\n", + " ]\n", + "}\n", + "\n", + "print(\"Filter: Active companies with investors\")\n", + "print(f\"Records limit: 5\\n\")\n", + "\n", + "async with client:\n", + " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " filter=FUNDED_FILTER,\n", + " records_limit=5\n", + " )\n", + "\n", + "print(f\"Snapshot created: {snapshot_id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 2: Download data\n", + "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n", + "\n", + "async with client:\n", + " data = await client.datasets.crunchbase_companies.download(snapshot_id)\n", + "\n", + "print(f\"Downloaded {len(data)} companies:\")\n", + "for company in data:\n", + " print(f\"\\n Name: {company.get('name', 'N/A')}\")\n", + " print(f\" Status: {company.get('operating_status', 'N/A')}\")\n", + " print(f\" Investors: {company.get('num_investors', 'N/A')}\")\n", + " print(f\" Industries: {company.get('industries', 'N/A')}\")\n", + " print(f\" CB Rank: {company.get('cb_rank', 'N/A')}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 1: Create filter\n", + "IPO_FILTER = {\n", + " \"name\": \"ipo_status\",\n", + " \"operator\": \"=\",\n", + " \"value\": \"public\"\n", + "}\n", + "\n", + "print(\"Filter: Public companies\")\n", + "print(f\"Records limit: 5\\n\")\n", + "\n", + "async with client:\n", + " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " filter=IPO_FILTER,\n", + " records_limit=5\n", + " )\n", + "\n", + "print(f\"Snapshot created: {snapshot_id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 2: Download data\n", + "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n", + "\n", + "async with client:\n", + " data = await client.datasets.crunchbase_companies.download(snapshot_id)\n", + "\n", + "print(f\"Downloaded {len(data)} companies:\")\n", + "for company in data:\n", + " print(f\" - {company.get('name', 'N/A')}\")\n", + " print(f\" IPO Status: {company.get('ipo_status', 'N/A')}\")\n", + " print(f\" Stock Symbol: {company.get('stock_symbol', 'N/A')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "from brightdata.datasets import export_json, export_csv, export\n\n# Export to JSON\njson_file = export_json(data, \"crunchbase_results.json\")\nprint(f\"Exported to: {json_file}\")\n\n# Export to CSV\ncsv_file = export_csv(data, \"crunchbase_results.csv\")\nprint(f\"Exported to: {csv_file}\")\n\n# Or use auto-detect based on extension\n# export(data, \"results.json\")\n# export(data, \"results.csv\")\n\nprint(f\"\\nRecords: {len(data)}\")" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Combined filter: active companies with investors\n", + 
"FUNDED_FILTER = {\n", + " \"operator\": \"and\",\n", + " \"filters\": [\n", + " {\"name\": \"operating_status\", \"operator\": \"=\", \"value\": \"active\"},\n", + " {\"name\": \"num_investors\", \"operator\": \">\", \"value\": 0}\n", + " ]\n", + "}\n", + "\n", + "print(\"Filter: Active companies with investors\")\n", + "print(f\"Records limit: 5\\n\")\n", + "\n", + "async with client:\n", + " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " filter=FUNDED_FILTER,\n", + " records_limit=5\n", + " )\n", + " print(f\"Snapshot: {snapshot_id}\")\n", + " \n", + " data = await client.datasets.crunchbase_companies.download(snapshot_id)\n", + "\n", + "print(f\"\\nDownloaded {len(data)} companies:\")\n", + "for company in data:\n", + " print(f\"\\n Name: {company.get('name', 'N/A')}\")\n", + " print(f\" Status: {company.get('operating_status', 'N/A')}\")\n", + " print(f\" Investors: {company.get('num_investors', 'N/A')}\")\n", + " print(f\" Industries: {company.get('industries', 'N/A')}\")\n", + " print(f\" CB Rank: {company.get('cb_rank', 'N/A')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 7: Filter by IPO Status" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Filter: public companies\n", + "IPO_FILTER = {\n", + " \"name\": \"ipo_status\",\n", + " \"operator\": \"=\",\n", + " \"value\": \"public\"\n", + "}\n", + "\n", + "print(\"Filter: Public companies\")\n", + "print(f\"Records limit: 5\\n\")\n", + "\n", + "async with client:\n", + " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " filter=IPO_FILTER,\n", + " records_limit=5\n", + " )\n", + " print(f\"Snapshot: {snapshot_id}\")\n", + " \n", + " data = await client.datasets.crunchbase_companies.download(snapshot_id)\n", + "\n", + "print(f\"\\nDownloaded {len(data)} companies:\")\n", + "for company in data:\n", + " print(f\" - {company.get('name', 'N/A')}\")\n", + " print(f\" IPO Status: {company.get('ipo_status', 'N/A')}\")\n", + " print(f\" Stock Symbol: {company.get('stock_symbol', 'N/A')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 8: Export Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from pathlib import Path\n", + "\n", + "if data:\n", + " output_file = Path.cwd() / \"crunchbase_dataset_results.json\"\n", + " \n", + " with open(output_file, \"w\") as f:\n", + " json.dump(data, f, indent=2, default=str)\n", + " \n", + " print(f\"Exported to: {output_file}\")\n", + " print(f\"Records: {len(data)}\")\n", + "else:\n", + " print(\"No data to export\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Summary\n", + "\n", + "### Crunchbase Companies Dataset\n", + "\n", + "| Info | Value |\n", + "|------|-------|\n", + "| Dataset ID | `gd_l1vijqt9jfj7olije` |\n", + "| Records | 2.3M+ companies |\n", + "| Total Fields | 98 |\n", + "| Access | `client.datasets.crunchbase_companies` |\n", + "\n", + "### Key Fields for Filtering\n", + "\n", + "| Field | Type | Fill Rate | Description |\n", + "|-------|------|-----------|-------------|\n", + "| `name` | text | 100% | Company name |\n", + "| `operating_status` | text | 100% | active, closed, etc. |\n", + "| `ipo_status` | text | 99.9% | public, private, etc. 
|\n", + "| `country_code` | text | 93.5% | Country code |\n", + "| `cb_rank` | number | 97% | Crunchbase rank |\n", + "| `num_employees` | text | 86.3% | Employee count range |\n", + "| `num_employee_profiles` | number | 99.9% | LinkedIn profiles |\n", + "| `num_investors` | number | 8.2% | Investor count |\n", + "| `industries` | array | 94.5% | Industry categories |\n", + "\n", + "### Example Filters\n", + "\n", + "```python\n", + "# Active companies\n", + "{\"name\": \"operating_status\", \"operator\": \"=\", \"value\": \"active\"}\n", + "\n", + "# Public companies\n", + "{\"name\": \"ipo_status\", \"operator\": \"=\", \"value\": \"public\"}\n", + "\n", + "# US-based companies\n", + "{\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"USA\"}\n", + "\n", + "# Companies with funding\n", + "{\n", + " \"operator\": \"and\",\n", + " \"filters\": [\n", + " {\"name\": \"operating_status\", \"operator\": \"=\", \"value\": \"active\"},\n", + " {\"name\": \"num_investors\", \"operator\": \">\", \"value\": 0}\n", + " ]\n", + "}\n", + "```\n", + "\n", + "### Helper Methods\n", + "\n", + "| Method | Description |\n", + "|--------|-------------|\n", + "| `get_field_names()` | List all 98 field names |\n", + "| `get_high_fill_rate_fields(min_rate)` | Fields above fill rate threshold |\n", + "| `get_fields_by_type(type)` | Fields of specific type |" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/datasets/linkedin/linkedin.ipynb b/notebooks/datasets/linkedin/linkedin.ipynb new file mode 100644 index 0000000..d439317 --- /dev/null +++ b/notebooks/datasets/linkedin/linkedin.ipynb @@ -0,0 +1,754 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 📊 LinkedIn Datasets API\n", + "\n", + "Access Bright Data's pre-collected LinkedIn datasets:\n", + "- **LinkedIn People Profiles**: 620M+ profiles with 42 fields\n", + "- **LinkedIn Company Profiles**: 58.5M+ companies with 36 fields\n", + "\n", + "Unlike web scrapers that collect data on-demand, datasets provide instant access to pre-collected, structured data filtered by your criteria.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API Token: 7011787d-2...3336\n", + "Setup complete!\n" + ] + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "\n", + "API_TOKEN = os.getenv(\"BRIGHTDATA_API_TOKEN\")\n", + "if not API_TOKEN:\n", + " raise ValueError(\"Set BRIGHTDATA_API_TOKEN in .env file\")\n", + "\n", + "print(f\"API Token: {API_TOKEN[:10]}...{API_TOKEN[-4:]}\")\n", + "print(\"Setup complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Client initialized\n", + "Available datasets: linkedin_profiles, linkedin_companies, amazon_products, 
crunchbase_companies\n" + ] + } + ], + "source": [ + "from brightdata import BrightDataClient\n", + "\n", + "client = BrightDataClient(token=API_TOKEN)\n", + "\n", + "print(\"Client initialized\")\n", + "print(f\"Available datasets: linkedin_profiles, linkedin_companies, amazon_products, crunchbase_companies\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 1: List Available Datasets\n", + "\n", + "List all datasets available in your account." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fetching available datasets...\n", + "\n", + "Found 174 datasets:\n", + "\n", + " - Crunchbase companies information\n", + " ID: gd_l1vijqt9jfj7olije\n", + " Size: 2.3M records\n", + "\n", + " - Instagram - Profiles\n", + " ID: gd_l1vikfch901nx3by4\n", + " Size: 620.0M records\n", + "\n", + " - Manta businesses \n", + " ID: gd_l1vil1d81g0u8763b2\n", + " Size: 5.6M records\n", + "\n", + " - US lawyers directory\n", + " ID: gd_l1vil5n11okchcbvax\n", + " Size: 1.4M records\n", + "\n", + " - LinkedIn company information\n", + " ID: gd_l1vikfnt1wgvvqz95w\n", + " Size: 55.0M records\n", + "\n", + " - LinkedIn people profiles\n", + " ID: gd_l1viktl72bvl7bjuj0\n", + " Size: 115.0M records\n", + "\n", + " - TikTok - Profiles\n", + " ID: gd_l1villgoiiidt09ci\n", + " Size: 152.0M records\n", + "\n", + " - Slintel 6sense company information\n", + " ID: gd_l1vilg5a1decoahvgq\n", + " Size: 10.9M records\n", + "\n", + " - Owler companies information\n", + " ID: gd_l1vilaxi10wutoage7\n", + " Size: 6.1M records\n", + "\n", + " - VentureRadar company information\n", + " ID: gd_l1vilsfd1xpsndbtpr\n", + " Size: 0.3M records\n", + "\n" + ] + } + ], + "source": [ + "print(\"Fetching available datasets...\\n\")\n", + "\n", + "async with client:\n", + " datasets = await client.datasets.list()\n", + "\n", + "print(f\"Found {len(datasets)} datasets:\\n\")\n", + "for ds in datasets[:10]: # Show first 10\n", + " size_m = ds.size / 1_000_000 if ds.size else 0\n", + " print(f\" - {ds.name}\")\n", + " print(f\" ID: {ds.id}\")\n", + " print(f\" Size: {size_m:.1f}M records\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 2: Explore LinkedIn Profiles Fields\n", + "\n", + "Before filtering, explore available fields using the class metadata." 
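+    ,
+    "\n",
+    "\n",
+    "A quick sanity check before filtering (sketch using the class helpers listed in the summary at the end of this notebook):\n",
+    "\n",
+    "```python\n",
+    "from brightdata.datasets import LinkedInPeopleProfiles\n",
+    "\n",
+    "assert \"followers\" in LinkedInPeopleProfiles.get_field_names()\n",
+    "print(LinkedInPeopleProfiles.FIELDS[\"followers\"][\"fill_rate\"])  # 71.39\n",
+    "```"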
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== LinkedIn People Profiles Dataset ===\n", + "Dataset ID: gd_l1viktl72bvl7bjuj0\n", + "Total fields: 42\n", + "\n", + "High fill rate fields (>70%): 19\n", + " - id: text (100.0%)\n", + " A unique identifier for the person's LinkedIn profile\n", + " - name: text (97.54%)\n", + " Profile name\n", + " - first_name: text (95.1%)\n", + " First name of the user\n", + " - last_name: text (94.8%)\n", + " Last name of the user\n", + " - city: text (96.3%)\n", + " Geographical location of the user\n", + " - country_code: text (97.11%)\n", + " Geographical location of the user\n", + " - position: text (91.23%)\n", + " The current job title or position of the profile\n", + " - url: url (100.0%)\n", + " URL that links directly to the LinkedIn profile\n", + " - input_url: url (100.0%)\n", + " The URL that was entered when starting the scraping process\n", + " - linkedin_id: text (100.0%)\n", + " LinkedIn profile identifier\n", + " - linkedin_num_id: text (100.0%)\n", + " Numeric LinkedIn profile ID\n", + " - avatar: url (96.28%)\n", + " URL that links to the profile picture of the LinkedIn user\n", + " - banner_image: url (96.28%)\n", + " Banner image URL\n", + " - default_avatar: boolean (95.73%)\n", + " Is the avatar picture the default empty picture\n", + " - followers: number (71.39%)\n", + " How many users/companies following the profile\n", + " - connections: number (70.33%)\n", + " How many connections the profile has\n", + " - memorialized_account: boolean (99.44%)\n", + " Boolean indicating if the account is memorialized\n", + " - current_company: object (100.0%)\n", + " Current professional position info: company name, job title, company ID, industry\n", + " - experience: array (71.49%)\n", + " Professional history: job titles, dates, companies, locations\n" + ] + } + ], + "source": [ + "from brightdata.datasets import LinkedInPeopleProfiles\n", + "\n", + "print(\"=== LinkedIn People Profiles Dataset ===\")\n", + "print(f\"Dataset ID: {LinkedInPeopleProfiles.DATASET_ID}\")\n", + "print(f\"Total fields: {len(LinkedInPeopleProfiles.FIELDS)}\")\n", + "\n", + "# Get high fill rate fields (more reliable for filtering)\n", + "high_fill = LinkedInPeopleProfiles.get_high_fill_rate_fields(min_rate=70.0)\n", + "print(f\"\\nHigh fill rate fields (>70%): {len(high_fill)}\")\n", + "for field in high_fill:\n", + " info = LinkedInPeopleProfiles.FIELDS[field]\n", + " print(f\" - {field}: {info['type']} ({info['fill_rate']}%)\")\n", + " print(f\" {info['description']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== All Available Fields ===\n", + " id: text - 100.0%\n", + " name: text - 97.54%\n", + " first_name: text - 95.1%\n", + " last_name: text - 94.8%\n", + " city: text - 96.3%\n", + " country_code: text - 97.11%\n", + " location: text - 61.93%\n", + " position: text - 91.23%\n", + " about: text - 18.9%\n", + " url: url - 100.0%\n", + " input_url: url - 100.0%\n", + " linkedin_id: text - 100.0%\n", + " linkedin_num_id: text - 100.0%\n", + " avatar: url - 96.28%\n", + " banner_image: url - 96.28%\n", + " default_avatar: boolean - 95.73%\n", + " followers: number - 71.39%\n", + " connections: number - 70.33%\n", + " recommendations_count: number - 3.65%\n", + " influencer: boolean - 46.06%\n", + " memorialized_account: 
boolean - 99.44%\n", + " current_company_name: text - 69.6%\n", + " current_company_company_id: text - 38.94%\n", + " current_company: object - 100.0%\n", + " experience: array - 71.49%\n", + " education: array - 41.97%\n", + " educations_details: text - 42.08%\n", + " posts: array - 1.27%\n", + " activity: array - 32.95%\n", + " certifications: array - 8.35%\n", + " courses: array - 2.55%\n", + " languages: array - 9.19%\n", + " publications: array - 1.23%\n", + " patents: array - 0.13%\n", + " projects: array - 2.08%\n", + " honors_and_awards: array - 2.13%\n", + " recommendations: array - 3.61%\n", + " volunteer_experience: array - 4.12%\n", + " organizations: array - 1.78%\n", + " people_also_viewed: array - 33.36%\n", + " similar_profiles: array - 0.58%\n", + " bio_links: array - 2.96%\n" + ] + } + ], + "source": [ + "# Show all available field names\n", + "print(\"\\n=== All Available Fields ===\")\n", + "for name, info in LinkedInPeopleProfiles.FIELDS.items():\n", + " print(f\" {name}: {info['type']} - {info.get('fill_rate', 'N/A')}%\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 3: Get Dataset Metadata from API\n", + "\n", + "Fetch live metadata from the API to see current field schema." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fetching LinkedIn Profiles metadata from API...\n", + "\n", + "Dataset ID: gd_l1viktl72bvl7bjuj0\n", + "Total fields from API: 45\n", + "\n", + "=== Sample Fields ===\n", + " id:\n", + " type: text\n", + " active: True\n", + " description: A unique identifier for the person's LinkedIn profile\n", + " name:\n", + " type: text\n", + " active: True\n", + " description: Profile name\n", + " city:\n", + " type: text\n", + " active: True\n", + " description: Geographical location of the user\n", + " country_code:\n", + " type: text\n", + " active: True\n", + " description: Geographical location of the user\n", + " position:\n", + " type: text\n", + " active: True\n", + " description: The current job title or position of the profile\n", + " about:\n", + " type: text\n", + " active: True\n", + " description: A concise profile summary. In some cases, only a truncated version with \"…\" is displayed on the website, and this is the version we capture\n", + " posts:\n", + " type: array\n", + " active: True\n", + " description: Contains information related to the user's last LinkedIn posts. It typically includes the post title, created date, URL link to the post, etc.\n", + " groups:\n", + " type: array\n", + " active: False\n", + " description: The LinkedIn groups that the profile is a part of\n", + " current_company:\n", + " type: object\n", + " active: True\n", + " description: Provides information about the user's current professional position. It typically includes the company name, the user's job title, the company ID, and the industry or sector to which the company belongs\n", + " experience:\n", + " type: array\n", + " active: True\n", + " description: Contains information about user's professional history. 
It typically includes the user's job title, length of time the user held the position, the geographic location of the company, the start and end date, the company name, URL link to the company profile, etc.\n" + ] + } + ], + "source": [ + "print(\"Fetching LinkedIn Profiles metadata from API...\\n\")\n", + "\n", + "async with client:\n", + " metadata = await client.datasets.linkedin_profiles.get_metadata()\n", + "\n", + "print(f\"Dataset ID: {metadata.id}\")\n", + "print(f\"Total fields from API: {len(metadata.fields)}\")\n", + "\n", + "print(\"\\n=== Sample Fields ===\")\n", + "for i, (name, field) in enumerate(list(metadata.fields.items())[:10]):\n", + " print(f\" {name}:\")\n", + " print(f\" type: {field.type}\")\n", + " print(f\" active: {field.active}\")\n", + " print(f\" description: {field.description or 'N/A'}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 4: Filter Dataset (Simple Filter)\n", + "\n", + "Filter profiles by a single criterion. Returns a snapshot_id for later download." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filter: {'name': 'followers', 'operator': '>', 'value': 10000}\n", + "Records limit: 2\n", + "\n", + "Snapshot created: snap_mlev60jlf03ta3ev\n", + "\n", + "Note: filter() returns immediately with a snapshot_id.\n", + "The snapshot is built asynchronously - use get_status() or download() next.\n" + ] + } + ], + "source": [ + "# Simple filter: profiles with 10,000+ followers\n", + "FILTER = {\n", + " \"name\": \"followers\",\n", + " \"operator\": \">\",\n", + " \"value\": 10000\n", + "}\n", + "LIMIT = 2 # Only get 2 records for demo\n", + "\n", + "print(f\"Filter: {FILTER}\")\n", + "print(f\"Records limit: {LIMIT}\\n\")\n", + "\n", + "async with client:\n", + " snapshot_id = await client.datasets.linkedin_profiles.filter(\n", + " filter=FILTER,\n", + " records_limit=LIMIT\n", + " )\n", + "\n", + "print(f\"Snapshot created: {snapshot_id}\")\n", + "print(\"\\nNote: filter() returns immediately with a snapshot_id.\")\n", + "print(\"The snapshot is built asynchronously - use get_status() or download() next.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 5: Check Snapshot Status\n", + "\n", + "Check the status of a snapshot before downloading." 
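+    ,
+    "\n",
+    "\n",
+    "Snapshots progress through statuses such as `scheduled` → `ready` (or `failed`). A non-blocking sketch that checks once instead of letting `download()` poll:\n",
+    "\n",
+    "```python\n",
+    "async with client:\n",
+    "    status = await client.datasets.linkedin_profiles.get_status(snapshot_id)\n",
+    "    if status.status == \"ready\":\n",
+    "        data = await client.datasets.linkedin_profiles.download(snapshot_id)\n",
+    "    else:\n",
+    "        print(f\"Not ready yet: {status.status}\")\n",
+    "```"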
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking status for snapshot: snap_mlev60jlf03ta3ev\n", + "\n", + "=== Snapshot Status ===\n", + "ID: snap_mlev60jlf03ta3ev\n", + "Status: ready\n", + "Dataset ID: gd_l1viktl72bvl7bjuj0\n", + "Records: 2\n", + "File size: 21733 bytes\n", + "Cost: $0\n" + ] + } + ], + "source": [ + "print(f\"Checking status for snapshot: {snapshot_id}\\n\")\n", + "\n", + "async with client:\n", + " status = await client.datasets.linkedin_profiles.get_status(snapshot_id)\n", + "\n", + "print(f\"=== Snapshot Status ===\")\n", + "print(f\"ID: {status.id}\")\n", + "print(f\"Status: {status.status}\")\n", + "print(f\"Dataset ID: {status.dataset_id}\")\n", + "print(f\"Records: {status.dataset_size}\")\n", + "print(f\"File size: {status.file_size} bytes\")\n", + "print(f\"Cost: ${status.cost}\")\n", + "\n", + "if status.error:\n", + " print(f\"Error: {status.error}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 6: Download Snapshot Data\n", + "\n", + "Download the filtered data. This polls until ready, then returns the records." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "snapshot_id=\"snap_mlev60jlf03ta3ev\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading snapshot: snap_mlev60jlf03ta3ev\n", + "(This will poll until ready...)\n", + "\n", + "Downloaded 2 profiles\n", + "\n", + "=== Profile 1 ===\n", + " Name: Jacques Wakefield\n", + " Position: Affiliate Marketer\n", + " City: Jackson, Tennessee, United States\n", + " Country: US\n", + " Followers: 15700\n", + " Connections: 500\n", + " URL: https://linkedin.com/in/jacqueswakefield\n", + "\n", + "=== Profile 2 ===\n", + " Name: Ajay Anand\n", + " Position: Ajay Anand, EY Global Vice Chair, Global Delivery Services |Innovator | Technologist | Board Advisor\n", + " City: San Francisco Bay Area\n", + " Country: US\n", + " Followers: 10649\n", + " Connections: 500\n", + " URL: https://ae.linkedin.com/in/ajay-anand-1912512\n", + "\n" + ] + } + ], + "source": [ + "print(f\"Downloading snapshot: {snapshot_id}\")\n", + "print(\"(This will poll until ready...)\\n\")\n", + "\n", + "async with client:\n", + " data = await client.datasets.linkedin_profiles.download(\n", + " snapshot_id,\n", + " format=\"jsonl\",\n", + " timeout=300, # 5 minutes\n", + " poll_interval=5 # Check every 5 seconds\n", + " )\n", + "\n", + "print(f\"Downloaded {len(data)} profiles\\n\")\n", + "\n", + "# Display first few profiles\n", + "for i, profile in enumerate(data[:3]):\n", + " print(f\"=== Profile {i+1} ===\")\n", + " print(f\" Name: {profile.get('name', 'N/A')}\")\n", + " print(f\" Position: {profile.get('position', 'N/A')}\")\n", + " print(f\" City: {profile.get('city', 'N/A')}\")\n", + " print(f\" Country: {profile.get('country_code', 'N/A')}\")\n", + " print(f\" Followers: {profile.get('followers', 'N/A')}\")\n", + " print(f\" Connections: {profile.get('connections', 'N/A')}\")\n", + " print(f\" URL: {profile.get('url', 'N/A')}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 7: Combined Filter (AND/OR)\n", + "\n", + "Filter with multiple conditions using AND/OR operators." 
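+    ,
+    "\n",
+    "\n",
+    "The `or` form has the same shape (see also the operator summary at the end of this notebook):\n",
+    "\n",
+    "```python\n",
+    "{\n",
+    "    \"operator\": \"or\",\n",
+    "    \"filters\": [\n",
+    "        {\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"US\"},\n",
+    "        {\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"UK\"}\n",
+    "    ]\n",
+    "}\n",
+    "```"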
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Step 1: Create filter\nCOMBINED_FILTER = {\n \"operator\": \"and\",\n \"filters\": [\n {\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"US\"},\n {\"name\": \"followers\", \"operator\": \">\", \"value\": 5000}\n ]\n}\n\nprint(\"Filter: US-based profiles with 5000+ followers\")\nprint(f\"Records limit: 5\\n\")\n\nasync with client:\n snapshot_id = await client.datasets.linkedin_profiles.filter(\n filter=COMBINED_FILTER,\n records_limit=5\n )\n\nprint(f\"Snapshot created: {snapshot_id}\")" + }, + { + "cell_type": "code", + "source": "# Step 2: Download data\nprint(f\"Downloading snapshot: {snapshot_id}\\n\")\n\nasync with client:\n data = await client.datasets.linkedin_profiles.download(snapshot_id)\n\nprint(f\"Downloaded {len(data)} profiles:\")\nfor profile in data:\n print(f\" - {profile.get('name', 'N/A')} ({profile.get('country_code', 'N/A')}) - {profile.get('followers', 0)} followers\")", + "metadata": {}, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 8: LinkedIn Company Profiles\n", + "\n", + "Access the LinkedIn Company Profiles dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Step 1: Create filter\nCOMPANY_FILTER = {\n \"name\": \"company_size\",\n \"operator\": \"=\",\n \"value\": \"1001-5000 employees\"\n}\n\nprint(f\"Filter: {COMPANY_FILTER}\")\nprint(f\"Records limit: 5\\n\")\n\nasync with client:\n snapshot_id = await client.datasets.linkedin_companies.filter(\n filter=COMPANY_FILTER,\n records_limit=5\n )\n\nprint(f\"Snapshot created: {snapshot_id}\")" + }, + { + "cell_type": "code", + "source": "# Step 2: Download data\nprint(f\"Downloading snapshot: {snapshot_id}\\n\")\n\nasync with client:\n data = await client.datasets.linkedin_companies.download(snapshot_id)\n\nprint(f\"Downloaded {len(data)} companies:\")\nfor company in data:\n print(f\"\\n=== {company.get('name', 'N/A')} ===\")\n print(f\" Industry: {company.get('industries', 'N/A')}\")\n print(f\" Size: {company.get('company_size', 'N/A')}\")\n print(f\" HQ: {company.get('headquarters', 'N/A')}\")\n print(f\" Website: {company.get('website', 'N/A')}\")\n print(f\" Followers: {company.get('followers', 'N/A')}\")", + "metadata": {}, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "from brightdata.datasets import export_json, export_csv, export\n\n# Export to JSON\njson_file = export_json(data, \"linkedin_results.json\")\nprint(f\"Exported to: {json_file}\")\n\n# Export to CSV\ncsv_file = export_csv(data, \"linkedin_results.csv\")\nprint(f\"Exported to: {csv_file}\")\n\n# Or use auto-detect based on extension\n# export(data, \"results.json\")\n# export(data, \"results.csv\")\n\nprint(f\"\\nRecords: {len(data)}\")" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 9: Export Results to JSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from pathlib import Path\n", + "\n", + "if data:\n", + " output_file = Path.cwd() / \"linkedin_dataset_results.json\"\n", + " \n", + " with open(output_file, \"w\") as f:\n", + " json.dump(data, f, indent=2, default=str)\n", + " \n", + " print(f\"Exported to: {output_file}\")\n", + " 
print(f\"Records: {len(data)}\")\n", + "else:\n", + " print(\"No data to export\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Summary\n", + "\n", + "### Datasets vs Web Scrapers\n", + "\n", + "| Feature | Datasets | Web Scrapers |\n", + "|---------|----------|-------------|\n", + "| Data source | Pre-collected database | Live scraping |\n", + "| Speed | Instant filtering | Real-time collection |\n", + "| Use case | Bulk data, analytics | Specific URLs, fresh data |\n", + "| Pricing | Per record filtered | Per request |\n", + "\n", + "### Available LinkedIn Datasets\n", + "\n", + "| Dataset | Records | Fields | Access |\n", + "|---------|---------|--------|--------|\n", + "| LinkedIn People Profiles | 620M+ | 42 | `client.datasets.linkedin_profiles` |\n", + "| LinkedIn Company Profiles | 58.5M+ | 36 | `client.datasets.linkedin_companies` |\n", + "\n", + "### Dataset Methods\n", + "\n", + "| Method | Description |\n", + "|--------|-------------|\n", + "| `get_metadata()` | Get field schema from API |\n", + "| `filter(filter, records_limit)` | Create filtered snapshot (returns snapshot_id) |\n", + "| `get_status(snapshot_id)` | Check snapshot status |\n", + "| `download(snapshot_id)` | Poll and download data |\n", + "\n", + "### Filter Operators\n", + "\n", + "| Operator | Description | Example |\n", + "|----------|-------------|---------|\n", + "| `=` | Equal to | `{\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"US\"}` |\n", + "| `!=` | Not equal | `{\"name\": \"country_code\", \"operator\": \"!=\", \"value\": \"CN\"}` |\n", + "| `>`, `<`, `>=`, `<=` | Numeric comparison | `{\"name\": \"followers\", \"operator\": \">\", \"value\": 10000}` |\n", + "| `in` | Value in list | `{\"name\": \"country_code\", \"operator\": \"in\", \"value\": [\"US\", \"UK\"]}` |\n", + "| `includes` | Text contains | `{\"name\": \"position\", \"operator\": \"includes\", \"value\": \"Engineer\"}` |\n", + "| `is_null` | Field is null | `{\"name\": \"about\", \"operator\": \"is_null\"}` |\n", + "| `is_not_null` | Field is not null | `{\"name\": \"about\", \"operator\": \"is_not_null\"}` |\n", + "\n", + "### Combined Filters\n", + "\n", + "```python\n", + "# AND condition\n", + "{\n", + " \"operator\": \"and\",\n", + " \"filters\": [\n", + " {\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"US\"},\n", + " {\"name\": \"followers\", \"operator\": \">\", \"value\": 5000}\n", + " ]\n", + "}\n", + "\n", + "# OR condition\n", + "{\n", + " \"operator\": \"or\",\n", + " \"filters\": [\n", + " {\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"US\"},\n", + " {\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"UK\"}\n", + " ]\n", + "}\n", + "```\n", + "\n", + "### Class Helper Methods\n", + "\n", + "| Method | Description |\n", + "|--------|-------------|\n", + "| `get_field_names()` | List all field names |\n", + "| `get_high_fill_rate_fields(min_rate)` | Fields with fill rate above threshold |" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pyproject.toml b/pyproject.toml index 654a9dd..adc068b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ 
-7,7 +7,7 @@ where = ["src"] [project] name = "brightdata-sdk" -version = "2.1.2" +version = "2.2.0" description = "Modern async-first Python SDK for Bright Data APIs" authors = [{name = "Bright Data", email = "support@brightdata.com"}] license = {text = "MIT"} diff --git a/requirements.txt b/requirements.txt index 314c9e8..ba3dedb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,3 @@ tldextract>=5.0.0 pydantic>=2.0.0 pydantic-settings>=2.0.0 click>=8.1.0 - diff --git a/src/brightdata/cli/README.md b/src/brightdata/cli/README.md index 989b2d6..b0fdce0 100644 --- a/src/brightdata/cli/README.md +++ b/src/brightdata/cli/README.md @@ -195,4 +195,3 @@ brightdata scrape --help brightdata scrape amazon --help brightdata search --help ``` - diff --git a/src/brightdata/cli/banner.py b/src/brightdata/cli/banner.py index af63dd5..05111bf 100644 --- a/src/brightdata/cli/banner.py +++ b/src/brightdata/cli/banner.py @@ -40,37 +40,37 @@ def get_banner() -> str: Formatted banner string with colors """ banner = """ - + \033[1;33m██████╗ ██████╗ ██╗ ██████╗ ██╗ ██╗████████╗\033[0m \033[1;33m██╔══██╗██╔══██╗██║██╔════╝ ██║ ██║╚══██╔══╝\033[0m \033[1;33m██████╔╝██████╔╝██║██║ ███╗███████║ ██║ \033[0m \033[1;33m██╔══██╗██╔══██╗██║██║ ██║██╔══██║ ██║ \033[0m \033[1;33m██████╔╝██║ ██║██║╚██████╔╝██║ ██║ ██║ \033[0m \033[1;33m╚═════╝ ╚═╝ ╚═╝╚═╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝ \033[0m - + \033[1;35m██████╗ █████╗ ████████╗ █████╗ \033[0m \033[1;35m██╔══██╗██╔══██╗╚══██╔══╝██╔══██╗\033[0m \033[1;35m██║ ██║███████║ ██║ ███████║\033[0m \033[1;35m██║ ██║██╔══██║ ██║ ██╔══██║\033[0m \033[1;35m██████╔╝██║ ██║ ██║ ██║ ██║\033[0m \033[1;35m╚═════╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝\033[0m - + \033[1;32m██████╗ ██╗ ██╗████████╗██╗ ██╗ ██████╗ ███╗ ██╗\033[0m \033[1;32m██╔══██╗╚██╗ ██╔╝╚══██╔══╝██║ ██║██╔═══██╗████╗ ██║\033[0m \033[1;32m██████╔╝ ╚████╔╝ ██║ ███████║██║ ██║██╔██╗ ██║\033[0m \033[1;32m██╔═══╝ ╚██╔╝ ██║ ██╔══██║██║ ██║██║╚██╗██║\033[0m \033[1;32m██║ ██║ ██║ ██║ ██║╚██████╔╝██║ ╚████║\033[0m \033[1;32m╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝ ╚═╝ ╚═══╝\033[0m - + \033[1;37m███████╗██████╗ ██╗ ██╗\033[0m \033[1;37m██╔════╝██╔══██╗██║ ██╔╝\033[0m \033[1;37m███████╗██║ ██║█████╔╝ \033[0m \033[1;37m╚════██║██║ ██║██╔═██╗ \033[0m \033[1;37m███████║██████╔╝██║ ██╗\033[0m \033[1;37m╚══════╝╚═════╝ ╚═╝ ╚═╝\033[0m - + \033[1;93m🐍\033[0m - + """ return banner diff --git a/src/brightdata/client.py b/src/brightdata/client.py index 253ae5d..0f69649 100644 --- a/src/brightdata/client.py +++ b/src/brightdata/client.py @@ -27,6 +27,7 @@ from .api.scrape_service import ScrapeService from .api.search_service import SearchService from .api.crawler_service import CrawlerService +from .datasets import DatasetsClient from .models import ScrapeResult from .types import AccountInfo from .constants import ( @@ -131,6 +132,7 @@ def __init__( self._search_service: Optional[SearchService] = None self._crawler_service: Optional[CrawlerService] = None self._web_unlocker_service: Optional[WebUnlockerService] = None + self._datasets_client: Optional[DatasetsClient] = None self._zone_manager: Optional[ZoneManager] = None self._is_connected = False self._account_info: Optional[Dict[str, Any]] = None @@ -282,6 +284,35 @@ def crawler(self) -> CrawlerService: self._crawler_service = CrawlerService(self) return self._crawler_service + @property + def datasets(self) -> DatasetsClient: + """ + Access pre-collected datasets. 
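+
+        Filtering is asynchronous on the server side: filter() returns a
+        snapshot_id immediately, and download() polls until the snapshot
+        is ready before fetching the records.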
+ + Provides access to Bright Data's datasets with filtering capabilities: + - client.datasets.list() + - client.datasets.linkedin_profiles.get_metadata() + - client.datasets.linkedin_profiles.filter(...) + - client.datasets.linkedin_profiles.download(snapshot_id) + + Returns: + DatasetsClient instance for dataset operations + + Example: + >>> # List available datasets + >>> datasets = await client.datasets.list() + >>> + >>> # Filter LinkedIn profiles + >>> snapshot_id = await client.datasets.linkedin_profiles.filter( + ... filter={"name": "industry", "operator": "=", "value": "Technology"}, + ... records_limit=100 + ... ) + >>> data = await client.datasets.linkedin_profiles.download(snapshot_id) + """ + if self._datasets_client is None: + self._datasets_client = DatasetsClient(self.engine) + return self._datasets_client + async def test_connection(self) -> bool: """ Test API connection and token validity. diff --git a/src/brightdata/datasets/__init__.py b/src/brightdata/datasets/__init__.py new file mode 100644 index 0000000..f170e23 --- /dev/null +++ b/src/brightdata/datasets/__init__.py @@ -0,0 +1,52 @@ +""" +Bright Data Datasets API client. + +Access pre-collected datasets and filter records. +""" + +from .client import DatasetsClient +from .base import BaseDataset, DatasetError +from .models import DatasetInfo, DatasetField, DatasetMetadata, SnapshotStatus +from .utils import export, export_json, export_jsonl, export_csv + +# Platform-specific datasets +from .linkedin import LinkedInPeopleProfiles, LinkedInCompanyProfiles +from .amazon import AmazonProducts +from .crunchbase import CrunchbaseCompanies +from .imdb import IMDBMovies +from .nba import NBAPlayersStats +from .goodreads import GoodreadsBooks +from .world_population import WorldPopulation + +__all__ = [ + # Client + "DatasetsClient", + # Base + "BaseDataset", + "DatasetError", + # Models + "DatasetInfo", + "DatasetField", + "DatasetMetadata", + "SnapshotStatus", + # Utils + "export", + "export_json", + "export_jsonl", + "export_csv", + # LinkedIn + "LinkedInPeopleProfiles", + "LinkedInCompanyProfiles", + # Amazon + "AmazonProducts", + # Crunchbase + "CrunchbaseCompanies", + # IMDB + "IMDBMovies", + # NBA + "NBAPlayersStats", + # Goodreads + "GoodreadsBooks", + # World Population + "WorldPopulation", +] diff --git a/src/brightdata/datasets/amazon/__init__.py b/src/brightdata/datasets/amazon/__init__.py new file mode 100644 index 0000000..75ab6ed --- /dev/null +++ b/src/brightdata/datasets/amazon/__init__.py @@ -0,0 +1,5 @@ +"""Amazon datasets.""" + +from .products import AmazonProducts + +__all__ = ["AmazonProducts"] diff --git a/src/brightdata/datasets/amazon/products.py b/src/brightdata/datasets/amazon/products.py new file mode 100644 index 0000000..b2680d7 --- /dev/null +++ b/src/brightdata/datasets/amazon/products.py @@ -0,0 +1,412 @@ +""" +Amazon Products dataset. + +Dataset ID: gd_l7q7dkf244hwjntr0 + +See FIELDS dict for all filterable fields with descriptions. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AmazonProducts(BaseDataset): + """ + Amazon Products dataset. + + Access Amazon product records with filtering. + + Example: + >>> products = client.datasets.amazon_products + >>> metadata = await products.get_metadata() + >>> snapshot_id = await products.filter( + ... filter={"name": "rating", "operator": ">", "value": 4.5}, + ... records_limit=100 + ... 
) + >>> data = await products.download(snapshot_id) + """ + + DATASET_ID = "gd_l7q7dkf244hwjntr0" + NAME = "amazon_products" + + # All available fields with metadata + # Format: field_name -> {"type": str, "description": str} + FIELDS: Dict[str, Dict[str, Any]] = { + # Core product identification + "title": { + "type": "text", + "description": "Product title/name", + }, + "asin": { + "type": "text", + "description": "Amazon Standard Identification Number", + }, + "parent_asin": { + "type": "text", + "description": "Parent ASIN for product variations", + }, + "input_asin": { + "type": "text", + "description": "Original input ASIN used for scraping", + }, + "url": { + "type": "url", + "description": "Full product page URL", + }, + "origin_url": { + "type": "url", + "description": "Original source URL", + }, + "domain": { + "type": "text", + "description": "Amazon domain (e.g., amazon.com)", + }, + # Brand & seller + "brand": { + "type": "text", + "description": "Product brand name", + }, + "seller_name": { + "type": "text", + "description": "Name of the seller", + }, + "seller_id": { + "type": "text", + "description": "Unique seller identifier", + }, + "seller_url": { + "type": "url", + "description": "URL to seller's storefront", + }, + "manufacturer": { + "type": "text", + "description": "Product manufacturer", + }, + "buybox_seller": { + "type": "text", + "description": "Current Buy Box winner seller", + }, + "number_of_sellers": { + "type": "number", + "description": "Number of sellers offering this product", + }, + "buybox_seller_rating": { + "type": "number", + "description": "Buy Box seller's rating", + }, + # Pricing + "initial_price": { + "type": "number", + "description": "Original/list price", + }, + "final_price": { + "type": "number", + "description": "Current selling price", + }, + "final_price_high": { + "type": "number", + "description": "High end of price range (for variations)", + }, + "currency": { + "type": "text", + "description": "Price currency code (e.g., USD)", + }, + "discount": { + "type": "text", + "description": "Discount percentage or amount", + }, + "buybox_prices": { + "type": "object", + "description": "Buy Box pricing details", + }, + "prices_breakdown": { + "type": "object", + "description": "Detailed price breakdown (list, deal, typical)", + }, + "other_sellers_prices": { + "type": "array", + "description": "Prices from other sellers", + }, + "coupon": { + "type": "text", + "description": "Available coupon code", + }, + "coupon_description": { + "type": "text", + "description": "Description of coupon discount", + }, + "inactive_buy_box": { + "type": "object", + "description": "Inactive Buy Box information", + }, + # Availability & shipping + "availability": { + "type": "text", + "description": "Stock availability status", + }, + "is_available": { + "type": "boolean", + "description": "Whether product is currently available", + }, + "max_quantity_available": { + "type": "number", + "description": "Maximum quantity available for purchase", + }, + "delivery": { + "type": "array", + "description": "Delivery options and dates", + }, + "ships_from": { + "type": "text", + "description": "Shipping origin location", + }, + "zipcode": { + "type": "text", + "description": "Delivery zipcode context", + }, + "city": { + "type": "text", + "description": "Delivery city context", + }, + "return_policy": { + "type": "text", + "description": "Return policy description", + }, + # Ratings & reviews + "rating": { + "type": "number", + "description": "Average star rating 
(0-5)", + }, + "reviews_count": { + "type": "number", + "description": "Total number of customer reviews", + }, + "answered_questions": { + "type": "number", + "description": "Number of answered Q&A", + }, + "top_review": { + "type": "text", + "description": "Featured/top customer review", + }, + "customer_says": { + "type": "text", + "description": "AI-generated customer sentiment summary", + }, + "customers_say": { + "type": "object", + "description": "Detailed customer feedback analysis", + }, + # Categories & rankings + "categories": { + "type": "array", + "description": "Product category hierarchy", + }, + "root_bs_category": { + "type": "text", + "description": "Root best seller category", + }, + "bs_category": { + "type": "text", + "description": "Best seller subcategory", + }, + "root_bs_rank": { + "type": "number", + "description": "Best seller rank in root category", + }, + "bs_rank": { + "type": "number", + "description": "Best seller rank in subcategory", + }, + "subcategory_rank": { + "type": "array", + "description": "Rankings in subcategories", + }, + "department": { + "type": "text", + "description": "Product department", + }, + # Badges & features + "badge": { + "type": "text", + "description": "Product badge (e.g., Best Seller)", + }, + "all_badges": { + "type": "array", + "description": "All product badges", + }, + "amazon_choice": { + "type": "boolean", + "description": "Whether product is Amazon's Choice", + }, + "amazon_prime": { + "type": "boolean", + "description": "Whether eligible for Prime", + }, + "premium_brand": { + "type": "boolean", + "description": "Whether a premium brand", + }, + "climate_pledge_friendly": { + "type": "boolean", + "description": "Climate Pledge Friendly certification", + }, + "sustainability_features": { + "type": "array", + "description": "Sustainability certifications and features", + }, + "sponsored": { + "type": "boolean", + "description": "Whether product listing is sponsored", + }, + # Product details + "description": { + "type": "text", + "description": "Short product description", + }, + "product_description": { + "type": "text", + "description": "Full product description", + }, + "features": { + "type": "array", + "description": "Product feature bullet points", + }, + "product_details": { + "type": "array", + "description": "Technical product specifications", + }, + "product_dimensions": { + "type": "text", + "description": "Product size dimensions", + }, + "item_weight": { + "type": "text", + "description": "Product weight", + }, + "model_number": { + "type": "text", + "description": "Manufacturer model number", + }, + "upc": { + "type": "text", + "description": "Universal Product Code", + }, + "ISBN10": { + "type": "text", + "description": "ISBN-10 for books", + }, + "ingredients": { + "type": "text", + "description": "Product ingredients (for applicable items)", + }, + "country_of_origin": { + "type": "text", + "description": "Country where product is made", + }, + "date_first_available": { + "type": "text", + "description": "Date product was first listed", + }, + "format": { + "type": "text", + "description": "Product format (for media items)", + }, + "language": { + "type": "text", + "description": "Product language", + }, + # Images & media + "image": { + "type": "url", + "description": "Main product image URL", + }, + "image_url": { + "type": "url", + "description": "Primary image URL", + }, + "images": { + "type": "array", + "description": "All product image URLs", + }, + "images_count": { + "type": "number", + 
"description": "Number of product images", + }, + "video": { + "type": "url", + "description": "Product video URL", + }, + "videos": { + "type": "array", + "description": "All product video URLs", + }, + "video_count": { + "type": "number", + "description": "Number of product videos", + }, + "downloadable_videos": { + "type": "array", + "description": "Downloadable video URLs", + }, + # Variations + "variations": { + "type": "array", + "description": "Product variations (size, color, etc.)", + }, + "variations_values": { + "type": "array", + "description": "Available variation options", + }, + # Enhanced content + "plus_content": { + "type": "boolean", + "description": "Whether has A+ Content", + }, + "from_the_brand": { + "type": "array", + "description": "Brand story/content section", + }, + "editorial_reviews": { + "type": "array", + "description": "Editorial review content", + }, + "about_the_author": { + "type": "text", + "description": "Author bio (for books)", + }, + # Store & purchase info + "store_url": { + "type": "url", + "description": "Brand store URL", + }, + "bought_past_month": { + "type": "number", + "description": "Units sold in past month", + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_fields_by_type(cls, field_type: str) -> list: + """Get fields of a specific type (text, number, array, object, url, boolean).""" + return [name for name, info in cls.FIELDS.items() if info.get("type") == field_type] + + @classmethod + def get_pricing_fields(cls) -> list: + """Get all pricing-related fields.""" + pricing_keywords = ["price", "cost", "discount", "coupon"] + return [ + name for name in cls.FIELDS.keys() if any(kw in name.lower() for kw in pricing_keywords) + ] diff --git a/src/brightdata/datasets/base.py b/src/brightdata/datasets/base.py new file mode 100644 index 0000000..62e008f --- /dev/null +++ b/src/brightdata/datasets/base.py @@ -0,0 +1,221 @@ +""" +Base dataset class - provides common functionality for all datasets. +""" + +import asyncio +import time +from typing import Dict, List, Any, Optional, Literal, TYPE_CHECKING + +from .models import DatasetMetadata, SnapshotStatus + +if TYPE_CHECKING: + from ..core.async_engine import AsyncEngine + + +class DatasetError(Exception): + """Error related to dataset operations.""" + + pass + + +class BaseDataset: + """ + Base class for all dataset types. + + Provides common methods: get_metadata(), filter(), get_status(), download(). + Subclasses set their own DATASET_ID and can add dataset-specific helpers. + """ + + BASE_URL = "https://api.brightdata.com" + DATASET_ID: str = "" # Override in subclasses + NAME: str = "" # Override in subclasses + + def __init__(self, engine: "AsyncEngine"): + self._engine = engine + self._metadata: Optional[DatasetMetadata] = None + + @property + def dataset_id(self) -> str: + return self.DATASET_ID + + @property + def name(self) -> str: + return self.NAME + + async def get_metadata(self) -> DatasetMetadata: + """ + Get dataset field schema. + + Returns field names, types, and descriptions for this dataset. + Use this to discover what fields you can filter by. 
+ + Returns: + DatasetMetadata with fields dict + """ + if self._metadata is None: + async with self._engine.get_from_url( + f"{self.BASE_URL}/datasets/{self.DATASET_ID}/metadata" + ) as response: + data = await response.json() + self._metadata = DatasetMetadata.from_dict(data) + return self._metadata + + async def filter( + self, + filter: Dict[str, Any], + records_limit: Optional[int] = None, + ) -> str: + """ + Filter dataset records and create a snapshot. + + Returns snapshot_id immediately - does NOT wait for results. + Use download() to poll and get the data. + + Args: + filter: Filter criteria. Example: + {"name": "industry", "operator": "=", "value": "Technology"} + Or with AND/OR: + { + "operator": "and", + "filters": [ + {"name": "industry", "operator": "=", "value": "Technology"}, + {"name": "followers", "operator": ">", "value": 10000} + ] + } + records_limit: Maximum number of records to return + + Returns: + snapshot_id (str) - use with download() to get data + """ + payload: Dict[str, Any] = { + "dataset_id": self.DATASET_ID, + "filter": filter, + } + if records_limit is not None: + payload["records_limit"] = records_limit + + async with self._engine.post_to_url( + f"{self.BASE_URL}/datasets/filter", + json_data=payload, + ) as response: + data = await response.json() + return data["snapshot_id"] + + async def get_status(self, snapshot_id: str) -> SnapshotStatus: + """ + Check snapshot status. + + Args: + snapshot_id: Snapshot ID from filter() + + Returns: + SnapshotStatus with status field: "scheduled", "building", "ready", or "failed" + """ + async with self._engine.get_from_url( + f"{self.BASE_URL}/datasets/snapshots/{snapshot_id}" + ) as response: + data = await response.json() + return SnapshotStatus.from_dict(data) + + async def download( + self, + snapshot_id: str, + format: Literal["json", "jsonl", "csv"] = "jsonl", + timeout: int = 300, + poll_interval: int = 5, + ) -> List[Dict[str, Any]]: + """ + Download snapshot data. + + Polls until snapshot is ready, then downloads and returns data. 
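+
+        Example (sketch; assumes snapshot_id came from a prior filter() call,
+        and "dataset" is any accessor such as client.datasets.linkedin_profiles):
+            >>> data = await dataset.download(snapshot_id, format="json", timeout=600)
+            >>> print(f"Downloaded {len(data)} records")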
+ + Args: + snapshot_id: Snapshot ID from filter() + format: Response format (json, jsonl, csv) + timeout: Max seconds to wait for snapshot to be ready + poll_interval: Seconds between status checks + + Returns: + List of records (dicts) + + Raises: + DatasetError: If snapshot fails + TimeoutError: If snapshot not ready within timeout + """ + start_time = time.time() + + # Poll until ready + while True: + status = await self.get_status(snapshot_id) + + if status.status == "ready": + break + elif status.status == "failed": + raise DatasetError(f"Snapshot failed: {status.error}") + elif time.time() - start_time > timeout: + raise TimeoutError( + f"Snapshot {snapshot_id} not ready after {timeout}s " + f"(status: {status.status})" + ) + + await asyncio.sleep(poll_interval) + + # Download data + async with self._engine.get_from_url( + f"{self.BASE_URL}/datasets/snapshots/{snapshot_id}/download", + params={"format": format}, + ) as response: + import json + + # Check for HTTP errors + if response.status >= 400: + error_text = await response.text() + raise DatasetError(f"Download failed (HTTP {response.status}): {error_text}") + + # Get raw text first + text = await response.text() + + # Handle empty response + if not text or not text.strip(): + return [] + + # Try to parse based on content type and format + content_type = response.headers.get("Content-Type", "") + + # Try JSON first (most common) + if "application/json" in content_type or text.strip().startswith("["): + try: + data = json.loads(text) + except json.JSONDecodeError: + pass + else: + # Successfully parsed as JSON + if isinstance(data, list): + return data + elif isinstance(data, dict) and "data" in data: + return data["data"] + else: + return [data] if data else [] + + # Try JSONL (newline-delimited JSON) + if "ndjson" in content_type or format == "jsonl" or "\n" in text.strip(): + try: + lines = [line.strip() for line in text.strip().split("\n") if line.strip()] + if lines: + data = [json.loads(line) for line in lines] + return data + except json.JSONDecodeError: + pass + + # Last resort: try as single JSON object + try: + data = json.loads(text) + if isinstance(data, list): + return data + elif isinstance(data, dict) and "data" in data: + return data["data"] + else: + return [data] if data else [] + except json.JSONDecodeError: + # Return raw text as fallback + return [{"raw": text}] diff --git a/src/brightdata/datasets/client.py b/src/brightdata/datasets/client.py new file mode 100644 index 0000000..70081c2 --- /dev/null +++ b/src/brightdata/datasets/client.py @@ -0,0 +1,136 @@ +""" +Datasets client - main entry point for datasets API. +""" + +from typing import List, Optional, TYPE_CHECKING + +from .models import DatasetInfo +from .linkedin import LinkedInPeopleProfiles, LinkedInCompanyProfiles +from .amazon import AmazonProducts +from .crunchbase import CrunchbaseCompanies +from .imdb import IMDBMovies +from .nba import NBAPlayersStats +from .goodreads import GoodreadsBooks +from .world_population import WorldPopulation + +if TYPE_CHECKING: + from ..core.async_engine import AsyncEngine + + +class DatasetsClient: + """ + Client for Bright Data Datasets API. + + Access pre-collected datasets and filter records. 
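+
+    Dataset accessors such as linkedin_profiles and amazon_products are
+    lazy properties: each dataset object is created on first access and
+    reused for the lifetime of the client.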
+ + Usage: + async with BrightDataClient() as client: + # List all datasets + datasets = await client.datasets.list() + + # Get metadata for a specific dataset + metadata = await client.datasets.linkedin_profiles.get_metadata() + + # Filter records + snapshot_id = await client.datasets.linkedin_profiles.filter( + filter={"name": "industry", "operator": "=", "value": "Technology"}, + records_limit=100 + ) + + # Download results + data = await client.datasets.linkedin_profiles.download(snapshot_id) + """ + + BASE_URL = "https://api.brightdata.com" + + def __init__(self, engine: "AsyncEngine"): + self._engine = engine + + # Lazy-loaded dataset instances + self._linkedin_profiles: Optional[LinkedInPeopleProfiles] = None + self._linkedin_companies: Optional[LinkedInCompanyProfiles] = None + self._amazon_products: Optional[AmazonProducts] = None + self._crunchbase_companies: Optional[CrunchbaseCompanies] = None + self._imdb_movies: Optional[IMDBMovies] = None + self._nba_players_stats: Optional[NBAPlayersStats] = None + self._goodreads_books: Optional[GoodreadsBooks] = None + self._world_population: Optional[WorldPopulation] = None + + async def list(self) -> List[DatasetInfo]: + """ + List all available datasets. + + Returns: + List of DatasetInfo with id, name, and size + """ + async with self._engine.get_from_url(f"{self.BASE_URL}/datasets/list") as response: + data = await response.json() + + datasets = [] + for item in data: + datasets.append( + DatasetInfo( + id=item.get("id", ""), + name=item.get("name", ""), + size=item.get("size", 0), + ) + ) + return datasets + + # Dataset properties for IDE autocomplete + + @property + def linkedin_profiles(self) -> LinkedInPeopleProfiles: + """LinkedIn People Profiles dataset (620M+ records).""" + if self._linkedin_profiles is None: + self._linkedin_profiles = LinkedInPeopleProfiles(self._engine) + return self._linkedin_profiles + + @property + def linkedin_companies(self) -> LinkedInCompanyProfiles: + """LinkedIn Company Profiles dataset.""" + if self._linkedin_companies is None: + self._linkedin_companies = LinkedInCompanyProfiles(self._engine) + return self._linkedin_companies + + @property + def amazon_products(self) -> AmazonProducts: + """Amazon Products dataset.""" + if self._amazon_products is None: + self._amazon_products = AmazonProducts(self._engine) + return self._amazon_products + + @property + def crunchbase_companies(self) -> CrunchbaseCompanies: + """Crunchbase Companies dataset (2.3M+ records).""" + if self._crunchbase_companies is None: + self._crunchbase_companies = CrunchbaseCompanies(self._engine) + return self._crunchbase_companies + + @property + def imdb_movies(self) -> IMDBMovies: + """IMDB Movies dataset (867K+ records).""" + if self._imdb_movies is None: + self._imdb_movies = IMDBMovies(self._engine) + return self._imdb_movies + + @property + def nba_players_stats(self) -> NBAPlayersStats: + """NBA Players Stats dataset (17K+ records).""" + if self._nba_players_stats is None: + self._nba_players_stats = NBAPlayersStats(self._engine) + return self._nba_players_stats + + @property + def goodreads_books(self) -> GoodreadsBooks: + """Goodreads Books dataset.""" + if self._goodreads_books is None: + self._goodreads_books = GoodreadsBooks(self._engine) + return self._goodreads_books + + @property + def world_population(self) -> WorldPopulation: + """World Population dataset.""" + if self._world_population is None: + self._world_population = WorldPopulation(self._engine) + return self._world_population diff --git 
a/src/brightdata/datasets/crunchbase/__init__.py b/src/brightdata/datasets/crunchbase/__init__.py new file mode 100644 index 0000000..5d1fdcc --- /dev/null +++ b/src/brightdata/datasets/crunchbase/__init__.py @@ -0,0 +1,5 @@ +"""Crunchbase datasets.""" + +from .companies import CrunchbaseCompanies + +__all__ = ["CrunchbaseCompanies"] diff --git a/src/brightdata/datasets/crunchbase/companies.py b/src/brightdata/datasets/crunchbase/companies.py new file mode 100644 index 0000000..18e8051 --- /dev/null +++ b/src/brightdata/datasets/crunchbase/companies.py @@ -0,0 +1,602 @@ +""" +Crunchbase Companies dataset. + +Dataset ID: gd_l1vijqt9jfj7olije +Records: 2.3M+ companies + +See FIELDS dict for all filterable fields with descriptions and fill rates. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class CrunchbaseCompanies(BaseDataset): + """ + Crunchbase Companies dataset. + + Access 2.3M+ Crunchbase company records with filtering. + + Example: + >>> companies = client.datasets.crunchbase_companies + >>> metadata = await companies.get_metadata() + >>> snapshot_id = await companies.filter( + ... filter={"name": "num_employees", "operator": ">", "value": 100}, + ... records_limit=100 + ... ) + >>> data = await companies.download(snapshot_id) + """ + + DATASET_ID = "gd_l1vijqt9jfj7olije" + NAME = "crunchbase_companies" + + # All available fields with metadata + # Format: field_name -> {"type": str, "description": str, "fill_rate": float} + FIELDS: Dict[str, Dict[str, Any]] = { + # Core identification + "name": { + "type": "text", + "description": "The name of the company", + "fill_rate": 100.00, + }, + "url": { + "type": "url", + "description": "The URL or web address associated with the company", + "fill_rate": 100.00, + }, + "id": { + "type": "text", + "description": "A unique identifier for each company in Crunchbase", + "fill_rate": 100.00, + }, + "uuid": { + "type": "text", + "description": "Universally unique identifier for the company", + "fill_rate": 100.00, + }, + "company_id": { + "type": "text", + "description": "A unique identifier for each company in Crunchbase", + "fill_rate": 99.07, + }, + "type": { + "type": "text", + "description": "Type of data entry", + "fill_rate": 100.00, + }, + # Company info + "about": { + "type": "text", + "description": "Overview or description of the company", + "fill_rate": 100.00, + }, + "full_description": { + "type": "text", + "description": "Detailed description of the company", + "fill_rate": 100.00, + }, + "company_overview": { + "type": "text", + "description": "Overview or description of the company", + "fill_rate": 99.07, + }, + "legal_name": { + "type": "text", + "description": "Legal name of the company", + "fill_rate": 59.62, + }, + "cb_rank": { + "type": "number", + "description": "Crunchbase rank assigned to the company", + "fill_rate": 97.02, + }, + "image": { + "type": "url", + "description": "Image or logo associated with the company", + "fill_rate": 94.22, + }, + # Status & type + "operating_status": { + "type": "text", + "description": "The current operating status of the company", + "fill_rate": 100.00, + }, + "company_type": { + "type": "text", + "description": "The type of company (eg, private, public)", + "fill_rate": 96.41, + }, + "ipo_status": { + "type": "text", + "description": "Status of the company regarding Initial Public Offering (IPO)", + "fill_rate": 99.94, + }, + "investor_type": { + "type": "text", + 
"description": "Type of investor", + "fill_rate": 0.00, + }, + # Location + "region": { + "type": "text", + "description": "The continent where the company's headquarters is located", + "fill_rate": 93.28, + }, + "country_code": { + "type": "text", + "description": "The country code where the company is located", + "fill_rate": 93.50, + }, + "hq_continent": { + "type": "text", + "description": "The continent where the company's headquarters is located", + "fill_rate": 92.59, + }, + "address": { + "type": "text", + "description": "Physical address of the company", + "fill_rate": 93.50, + }, + "location": { + "type": "array", + "description": "Location information for the company", + "fill_rate": 93.50, + "nested_fields": 2, + }, + "headquarters_regions": { + "type": "array", + "description": "Regions where the company has headquarters", + "fill_rate": 91.62, + "nested_fields": 2, + }, + # Industries & products + "industries": { + "type": "array", + "description": "Industries associated with the company", + "fill_rate": 94.51, + "nested_fields": 2, + }, + "total_active_products": { + "type": "number", + "description": "Total number of active products", + "fill_rate": 14.54, + }, + "siftery_products": { + "type": "array", + "description": "Products listed by Siftery", + "fill_rate": 14.45, + "nested_fields": 3, + }, + # Employees & contacts + "num_employees": { + "type": "text", + "description": "The number of employees in the company", + "fill_rate": 86.28, + }, + "num_employee_profiles": { + "type": "number", + "description": "Number of employee profiles associated with the company", + "fill_rate": 99.94, + }, + "number_of_employee_profiles": { + "type": "number", + "description": "Number of employee profiles associated with the company", + "fill_rate": 99.07, + }, + "num_contacts": { + "type": "number", + "description": "Total number of contacts associated with the company", + "fill_rate": 34.63, + }, + "number_of_contacts": { + "type": "number", + "description": "Total number of contacts associated with the company", + "fill_rate": 34.10, + }, + "num_contacts_linkedin": { + "type": "number", + "description": "Number of LinkedIn contacts", + "fill_rate": 34.64, + }, + "number_of_linkedin_contacts": { + "type": "number", + "description": "Number of LinkedIn contacts", + "fill_rate": 34.10, + }, + "contacts": { + "type": "array", + "description": "Contact information for the company", + "fill_rate": 46.38, + "nested_fields": 5, + }, + "current_employees": { + "type": "array", + "description": "Number of current employees", + "fill_rate": 25.29, + "nested_fields": 4, + }, + "num_alumni": { + "type": "number", + "description": "Total number of company alumni", + "fill_rate": 0.02, + }, + "alumni": { + "type": "array", + "description": "Information about company alumni", + "fill_rate": 0.61, + "nested_fields": 4, + }, + # Contact info + "website": { + "type": "text", + "description": "The official website of the company", + "fill_rate": 97.36, + }, + "contact_email": { + "type": "text", + "description": "Contact email address for the company", + "fill_rate": 74.28, + }, + "email_address": { + "type": "text", + "description": "Contact email address for the company", + "fill_rate": 73.56, + }, + "contact_phone": { + "type": "text", + "description": "Contact phone number for the company", + "fill_rate": 77.90, + }, + "phone_number": { + "type": "text", + "description": "Contact phone number for the company", + "fill_rate": 77.25, + }, + "social_media_links": { + "type": "array", + "description": 
"URLs of social media profiles associated with the company", + "fill_rate": 86.85, + }, + "socila_media_urls": { + "type": "array", + "description": "URLs of social media profiles associated with the company", + "fill_rate": 85.95, + }, + # Founding & dates + "founded_date": { + "type": "text", + "description": "The date when the company was founded", + "fill_rate": 2.42, + }, + # Funding & investments + "num_investors": { + "type": "number", + "description": "Number of investors in the company", + "fill_rate": 8.24, + }, + "investors": { + "type": "array", + "description": "List of investors in the company", + "fill_rate": 8.24, + "nested_fields": 6, + }, + "num_investments": { + "type": "number", + "description": "Total number of investments made by the company", + "fill_rate": 2.61, + }, + "investments": { + "type": "array", + "description": "Information about company investments", + "fill_rate": 2.61, + "nested_fields": 7, + }, + "num_investments_lead": { + "type": "number", + "description": "Number of investments led by the company", + "fill_rate": 1.40, + }, + "funding_rounds_list": { + "type": "array", + "description": "List of funding rounds", + "fill_rate": 10.06, + "nested_fields": 8, + }, + "funds_raised": { + "type": "array", + "description": "Total funds raised by the company", + "fill_rate": 2.61, + "nested_fields": 5, + }, + "num_funds": { + "type": "number", + "description": "Total number of funds", + "fill_rate": 0.31, + }, + "funds_list": { + "type": "array", + "description": "List of funds associated with the company", + "fill_rate": 0.31, + "nested_fields": 3, + }, + "num_diversity_spotlight_investments": { + "type": "number", + "description": "Number of diversity spotlight investments", + "fill_rate": 0.47, + }, + "diversity_investments": { + "type": "array", + "description": "Information about diversity investments", + "fill_rate": 0.47, + "nested_fields": 7, + }, + # Acquisitions & exits + "num_acquisitions": { + "type": "number", + "description": "Total number of acquisitions by the company", + "fill_rate": 1.88, + }, + "acquisitions": { + "type": "array", + "description": "Information about company acquisitions", + "fill_rate": 1.88, + "nested_fields": 4, + }, + "acquired_by": { + "type": "object", + "description": "Information about the acquiring entity", + "fill_rate": 4.57, + "nested_fields": 5, + }, + "num_exits": { + "type": "number", + "description": "Information about company exits", + "fill_rate": 0.06, + }, + "exits": { + "type": "array", + "description": "Information about company exits", + "fill_rate": 0.94, + "nested_fields": 4, + }, + # Organization structure + "num_sub_organizations": { + "type": "number", + "description": "Total number of sub-organizations", + "fill_rate": 0.53, + }, + "sub_organizations": { + "type": "array", + "description": "Sub-organizations associated with the company", + "fill_rate": 0.53, + "nested_fields": 4, + }, + "sub_organization_of": { + "type": "text", + "description": "Information about being a sub-organization of another entity", + "fill_rate": 0.80, + }, + # People + "founders": { + "type": "array", + "description": "Information about the founders of the company", + "fill_rate": 21.93, + "nested_fields": 3, + }, + "num_founder_alumni": { + "type": "number", + "description": "Total number of founder alumni", + "fill_rate": 0.01, + }, + "num_advisor_positions": { + "type": "number", + "description": "Number of advisory positions associated with the company", + "fill_rate": 3.51, + }, + "current_advisors": { + "type": 
"array", + "description": "List of current advisors for the company", + "fill_rate": 3.51, + "nested_fields": 4, + }, + "leadership_hire": { + "type": "array", + "description": "Leadership hiring information", + "fill_rate": 1.61, + "nested_fields": 4, + }, + "layoff": { + "type": "array", + "description": "Layoff information", + "fill_rate": 0.28, + "nested_fields": 4, + }, + "people_highlights": { + "type": "object", + "description": "Highlights of people associated with the company", + "fill_rate": 47.68, + "nested_fields": 3, + }, + # Technology + "active_tech_count": { + "type": "number", + "description": "Number of active technologies used by the company", + "fill_rate": 95.47, + }, + "builtwith_num_technologies_used": { + "type": "number", + "description": "Number of technologies the company is built with", + "fill_rate": 95.47, + }, + "built_with_num_technologies_used": { + "type": "number", + "description": "Number of technologies the company is built with", + "fill_rate": 94.61, + }, + "builtwith_tech": { + "type": "array", + "description": "Technologies used by the company", + "fill_rate": 93.77, + "nested_fields": 3, + }, + "built_with_tech": { + "type": "array", + "description": "Technologies used by the company", + "fill_rate": 92.91, + "nested_fields": 3, + }, + "technology_highlights": { + "type": "object", + "description": "Highlights of technologies used by the company", + "fill_rate": 96.06, + "nested_fields": 4, + }, + # Traffic & analytics + "monthly_visits": { + "type": "number", + "description": "Number of monthly website visits", + "fill_rate": 52.61, + }, + "monthly_visits_growth": { + "type": "number", + "description": "Growth in monthly visits", + "fill_rate": 44.34, + }, + "semrush_visits_latest_month": { + "type": "number", + "description": "Latest monthly visits data from SEMrush", + "fill_rate": 52.61, + }, + "semrush_visits_mom_pct": { + "type": "number", + "description": "Percentage growth in SEMrush visits", + "fill_rate": 44.34, + }, + "semrush_last_updated": { + "type": "text", + "description": "Last update date for SEMrush data", + "fill_rate": 52.61, + }, + "semrush_location_list": { + "type": "array", + "description": "List of locations according to SEMrush", + "fill_rate": 1.78, + "nested_fields": 5, + }, + # Third-party data + "bombora": { + "type": "array", + "description": "Bombora information", + "fill_rate": 21.27, + "nested_fields": 5, + }, + "bombora_last_updated": { + "type": "text", + "description": "Last update date for Bombora data", + "fill_rate": 24.26, + }, + "apptopia": { + "type": "array", + "description": "Apptopia data", + "fill_rate": 5.56, + "nested_fields": 4, + }, + "apptopia_total_downloads": { + "type": "number", + "description": "Total downloads according to Apptopia", + "fill_rate": 1.56, + }, + "apptopia_total_downloads_mom_pct": { + "type": "text", + "description": "Month-over-month percentage change in downloads", + "fill_rate": 1.35, + }, + "aberdeen_it_spend": { + "type": "object", + "description": "IT spending data from Aberdeen", + "fill_rate": 56.04, + "nested_fields": 3, + }, + "ipqwery": { + "type": "object", + "description": "IPQwery data", + "fill_rate": 8.42, + "nested_fields": 4, + }, + # Events & news + "num_event_appearances": { + "type": "number", + "description": "Number of appearances in events", + "fill_rate": 0.10, + }, + "event_appearances": { + "type": "array", + "description": "Number of times the company has appeared in events", + "fill_rate": 0.10, + "nested_fields": 5, + }, + "num_news": { + 
"type": "number", + "description": "Number of news articles related to the company", + "fill_rate": 0.53, + }, + "news": { + "type": "array", + "description": "News related to the company", + "fill_rate": 27.91, + "nested_fields": 6, + }, + # Lists & features + "featured_list": { + "type": "array", + "description": "Indicates if the company is featured on a list", + "fill_rate": 95.71, + "nested_fields": 4, + }, + "similar_companies": { + "type": "array", + "description": "List of companies similar to the specified company", + "fill_rate": 57.21, + "nested_fields": 2, + }, + # Financial highlights + "financials_highlights": { + "type": "object", + "description": "Highlights of financial data", + "fill_rate": 10.87, + "nested_fields": 4, + }, + "ipo_fields": { + "type": "object", + "description": "Information related to Initial Public Offering (IPO)", + "fill_rate": 1.49, + "nested_fields": 5, + }, + "stock_symbol": { + "type": "text", + "description": "Stock symbol associated with the company", + "fill_rate": 0.42, + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_high_fill_rate_fields(cls, min_rate: float = 50.0) -> list: + """Get fields with fill rate above threshold.""" + return [name for name, info in cls.FIELDS.items() if info.get("fill_rate", 0) >= min_rate] + + @classmethod + def get_fields_by_type(cls, field_type: str) -> list: + """Get fields of a specific type (text, number, array, object, url).""" + return [name for name, info in cls.FIELDS.items() if info.get("type") == field_type] diff --git a/src/brightdata/datasets/goodreads/__init__.py b/src/brightdata/datasets/goodreads/__init__.py new file mode 100644 index 0000000..4567b77 --- /dev/null +++ b/src/brightdata/datasets/goodreads/__init__.py @@ -0,0 +1,5 @@ +"""Goodreads dataset.""" + +from .books import GoodreadsBooks + +__all__ = ["GoodreadsBooks"] diff --git a/src/brightdata/datasets/goodreads/books.py b/src/brightdata/datasets/goodreads/books.py new file mode 100644 index 0000000..3c43689 --- /dev/null +++ b/src/brightdata/datasets/goodreads/books.py @@ -0,0 +1,121 @@ +""" +Goodreads Books dataset. + +Dataset ID: gd_lreq6ho72fhvovjj7a + +See FIELDS dict for all filterable fields with descriptions. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class GoodreadsBooks(BaseDataset): + """ + Goodreads Books dataset. + + Access Goodreads book records with filtering. + + Example: + >>> books = client.datasets.goodreads_books + >>> metadata = await books.get_metadata() + >>> snapshot_id = await books.filter( + ... filter={"name": "star_rating", "operator": ">", "value": 4.0}, + ... records_limit=100 + ... 
) + >>> data = await books.download(snapshot_id) + """ + + DATASET_ID = "gd_lreq6ho72fhvovjj7a" + NAME = "goodreads_books" + + # All available fields with metadata + FIELDS: Dict[str, Dict[str, Any]] = { + # Book identification + "id": { + "type": "text", + "description": "Goodreads book ID", + }, + "url": { + "type": "url", + "description": "Goodreads book page URL", + }, + "isbn": { + "type": "text", + "description": "ISBN number", + }, + # Book details + "name": { + "type": "text", + "description": "Book title", + }, + "author": { + "type": "array", + "description": "Author name(s)", + }, + "summary": { + "type": "text", + "description": "Book summary/description", + }, + "genres": { + "type": "array", + "description": "Book genres/categories", + }, + "first_published": { + "type": "text", + "description": "First publication date", + }, + # Ratings & reviews + "star_rating": { + "type": "number", + "description": "Average star rating (0-5)", + }, + "num_ratings": { + "type": "number", + "description": "Total number of ratings", + }, + "num_reviews": { + "type": "number", + "description": "Total number of reviews", + }, + "community_reviews": { + "type": "object", + "description": "Breakdown of reviews by star rating", + }, + # Author info + "about_author": { + "type": "object", + "description": "Author information (name, books, followers)", + }, + # Pricing + "kindle_price": { + "type": "text", + "description": "Kindle edition price", + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_fields_by_type(cls, field_type: str) -> list: + """Get fields of a specific type (text, number, array, object, url, boolean).""" + return [name for name, info in cls.FIELDS.items() if info.get("type") == field_type] + + @classmethod + def get_rating_fields(cls) -> list: + """Get all rating-related fields.""" + return [ + name + for name in cls.FIELDS.keys() + if "rating" in name.lower() or "review" in name.lower() + ] diff --git a/src/brightdata/datasets/imdb/__init__.py b/src/brightdata/datasets/imdb/__init__.py new file mode 100644 index 0000000..e0010a9 --- /dev/null +++ b/src/brightdata/datasets/imdb/__init__.py @@ -0,0 +1,5 @@ +"""IMDB dataset.""" + +from .movies import IMDBMovies + +__all__ = ["IMDBMovies"] diff --git a/src/brightdata/datasets/imdb/movies.py b/src/brightdata/datasets/imdb/movies.py new file mode 100644 index 0000000..a7face3 --- /dev/null +++ b/src/brightdata/datasets/imdb/movies.py @@ -0,0 +1,195 @@ +""" +IMDB Movies dataset. + +Dataset ID: gd_l1vikf2h1a4t6x8qzu + +See FIELDS dict for all filterable fields with descriptions. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class IMDBMovies(BaseDataset): + """ + IMDB Movies dataset. + + Access IMDB movie/TV records with filtering. + + Example: + >>> movies = client.datasets.imdb_movies + >>> metadata = await movies.get_metadata() + >>> snapshot_id = await movies.filter( + ... filter={"name": "imdb_rating", "operator": ">", "value": 8.0}, + ... records_limit=100 + ... 
) + >>> data = await movies.download(snapshot_id) + """ + + DATASET_ID = "gd_l1vikf2h1a4t6x8qzu" + NAME = "imdb_movies" + + # All available fields with metadata + FIELDS: Dict[str, Dict[str, Any]] = { + # Core identification + "id": { + "type": "text", + "description": "IMDB title ID (e.g., tt5931912)", + }, + "title": { + "type": "text", + "description": "Movie/show title", + }, + "url": { + "type": "url", + "description": "IMDB page URL", + }, + "media_type": { + "type": "text", + "description": "Type of media (Feature Film, Documentary, etc.)", + }, + # Ratings & reviews + "imdb_rating": { + "type": "number", + "description": "IMDB rating (0-10)", + }, + "imdb_rating_count": { + "type": "number", + "description": "Number of IMDB ratings", + }, + "popularity": { + "type": "number", + "description": "Popularity score", + }, + "review_count": { + "type": "number", + "description": "Number of user reviews", + }, + "review_rating": { + "type": "number", + "description": "Average user review rating", + }, + "critics_review_count": { + "type": "number", + "description": "Number of critic reviews", + }, + "featured_review": { + "type": "text", + "description": "Featured user review text", + }, + # Content details + "genres": { + "type": "array", + "description": "List of genres (e.g., Drama, Comedy)", + }, + "presentation": { + "type": "text", + "description": "Short presentation/tagline", + }, + "storyline": { + "type": "text", + "description": "Plot summary/storyline", + }, + "comment": { + "type": "text", + "description": "Additional comments", + }, + # Cast & crew + "credit": { + "type": "array", + "description": "Credits (directors, writers, etc.)", + }, + "top_cast": { + "type": "array", + "description": "Top cast members with character names", + }, + # Release details + "details_release_date": { + "type": "text", + "description": "Release date", + }, + "details_countries_of_origin": { + "type": "text", + "description": "Countries of origin", + }, + "details_language": { + "type": "text", + "description": "Languages", + }, + "details_also_known_as": { + "type": "text", + "description": "Alternative titles", + }, + "details_filming_locations": { + "type": "text", + "description": "Filming locations", + }, + "details_production_companies": { + "type": "text", + "description": "Production companies", + }, + "details_official_site": { + "type": "url", + "description": "Official website URL", + }, + # Technical specs + "specs_color": { + "type": "text", + "description": "Color format (Color, Black and White)", + }, + "specs_sound_mix": { + "type": "text", + "description": "Sound mix format", + }, + "specs_aspect_ratio": { + "type": "text", + "description": "Aspect ratio", + }, + # Media + "poster_url": { + "type": "url", + "description": "Movie poster image URL", + }, + "videos": { + "type": "array", + "description": "Video links (trailers, clips)", + }, + "photos": { + "type": "array", + "description": "Photo gallery links", + }, + # Awards & box office + "awards": { + "type": "text", + "description": "Awards and nominations", + }, + "boxoffice_budget": { + "type": "text", + "description": "Production budget", + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_fields_by_type(cls, field_type: str) -> list: + """Get fields of a specific type (text, number, array, object, url, boolean).""" + return [name for name, 
info in cls.FIELDS.items() if info.get("type") == field_type] + + @classmethod + def get_rating_fields(cls) -> list: + """Get all rating-related fields.""" + rating_keywords = ["rating", "review", "score"] + return [ + name for name in cls.FIELDS.keys() if any(kw in name.lower() for kw in rating_keywords) + ] diff --git a/src/brightdata/datasets/linkedin/__init__.py b/src/brightdata/datasets/linkedin/__init__.py new file mode 100644 index 0000000..7b4eacf --- /dev/null +++ b/src/brightdata/datasets/linkedin/__init__.py @@ -0,0 +1,6 @@ +"""LinkedIn datasets.""" + +from .people_profiles import LinkedInPeopleProfiles +from .company_profiles import LinkedInCompanyProfiles + +__all__ = ["LinkedInPeopleProfiles", "LinkedInCompanyProfiles"] diff --git a/src/brightdata/datasets/linkedin/company_profiles.py b/src/brightdata/datasets/linkedin/company_profiles.py new file mode 100644 index 0000000..a4c4bf2 --- /dev/null +++ b/src/brightdata/datasets/linkedin/company_profiles.py @@ -0,0 +1,197 @@ +""" +LinkedIn Company Profiles dataset. + +Dataset ID: gd_l1vikfnt1wgvvqz95w +Records: 58.5M+ companies + +See FIELDS dict for all filterable fields with descriptions. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class LinkedInCompanyProfiles(BaseDataset): + """ + LinkedIn Company Profiles dataset. + + Access 58.5M+ LinkedIn company records with filtering. + + Example: + >>> companies = client.datasets.linkedin_companies + >>> metadata = await companies.get_metadata() + >>> snapshot_id = await companies.filter( + ... filter={"name": "company_size", "operator": "=", "value": "1001-5000"}, + ... records_limit=100 + ... ) + >>> data = await companies.download(snapshot_id) + """ + + DATASET_ID = "gd_l1vikfnt1wgvvqz95w" + NAME = "linkedin_company_profiles" + + # All available fields with metadata + # Format: field_name -> {"type": str, "description": str} + FIELDS: Dict[str, Dict[str, Any]] = { + "id": { + "type": "text", + "description": "Unique identifier for the company profile (URL slug)", + }, + "name": { + "type": "text", + "description": "Company name", + }, + "country_code": { + "type": "text", + "description": "Two-letter country code (e.g., US, GB, FR)", + }, + "locations": { + "type": "array", + "description": "List of company office locations with addresses", + }, + "followers": { + "type": "number", + "description": "Number of LinkedIn followers", + }, + "employees_in_linkedin": { + "type": "number", + "description": "Number of employees with LinkedIn profiles", + }, + "about": { + "type": "text", + "description": "Company description/about section", + }, + "specialties": { + "type": "array", + "description": "List of company specialties/expertise areas", + }, + "company_size": { + "type": "text", + "description": "Employee count range (e.g., '1001-5000 employees')", + }, + "organization_type": { + "type": "text", + "description": "Type of organization (e.g., Public Company, Private)", + }, + "industries": { + "type": "text", + "description": "Primary industry classification", + }, + "website": { + "type": "url", + "description": "Company website URL", + }, + "crunchbase_url": { + "type": "url", + "description": "Link to Crunchbase profile if available", + }, + "founded": { + "type": "number", + "description": "Year the company was founded", + }, + "company_id": { + "type": "text", + "description": "LinkedIn numeric company ID", + }, + "employees": { + "type": "array", + "description": 
"List of employee profiles with basic info", + }, + "headquarters": { + "type": "text", + "description": "City/region of company headquarters", + }, + "image": { + "type": "url", + "description": "Company cover/banner image URL", + }, + "logo": { + "type": "url", + "description": "Company logo image URL", + }, + "similar": { + "type": "array", + "description": "Similar companies suggested by LinkedIn", + }, + "url": { + "type": "url", + "description": "Full LinkedIn company profile URL", + }, + "updates": { + "type": "array", + "description": "Recent company posts/updates", + }, + "slogan": { + "type": "text", + "description": "Company tagline or slogan", + }, + "affiliated": { + "type": "array", + "description": "Affiliated/subsidiary companies", + }, + "funding": { + "type": "object", + "description": "Funding information if available", + }, + "investors": { + "type": "array", + "description": "List of investors if available", + }, + "formatted_locations": { + "type": "array", + "description": "Formatted address strings for locations", + }, + "stock_info": { + "type": "object", + "description": "Stock ticker and exchange info for public companies", + }, + "get_directions_url": { + "type": "array", + "description": "Map/directions URLs for office locations", + }, + "description": { + "type": "text", + "description": "Brief company description with follower count", + }, + "additional_information": { + "type": "object", + "description": "Extra company details and metadata", + }, + "country_codes_array": { + "type": "array", + "description": "All country codes where company operates", + }, + "alumni": { + "type": "array", + "description": "Notable alumni from the company", + }, + "alumni_information": { + "type": "object", + "description": "Statistics about company alumni", + }, + "website_simplified": { + "type": "text", + "description": "Simplified/masked website domain", + }, + "unformatted_about": { + "type": "text", + "description": "Raw about text without formatting", + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_text_fields(cls) -> list: + """Get fields that are text type (commonly used for filtering).""" + return [name for name, info in cls.FIELDS.items() if info.get("type") == "text"] diff --git a/src/brightdata/datasets/linkedin/people_profiles.py b/src/brightdata/datasets/linkedin/people_profiles.py new file mode 100644 index 0000000..6c90bca --- /dev/null +++ b/src/brightdata/datasets/linkedin/people_profiles.py @@ -0,0 +1,285 @@ +""" +LinkedIn People Profiles dataset. + +Dataset ID: gd_l1viktl72bvl7bjuj0 +Records: 620M+ profiles + +See FIELDS dict for all filterable fields with descriptions and fill rates. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class LinkedInPeopleProfiles(BaseDataset): + """ + LinkedIn People Profiles dataset. + + Access 620M+ LinkedIn profile records with filtering. + + Example: + >>> profiles = client.datasets.linkedin_profiles + >>> metadata = await profiles.get_metadata() + >>> snapshot_id = await profiles.filter( + ... filter={"name": "industry", "operator": "=", "value": "Technology"}, + ... records_limit=100 + ... 
)
+        >>> data = await profiles.download(snapshot_id)
+    """
+
+    DATASET_ID = "gd_l1viktl72bvl7bjuj0"
+    NAME = "linkedin_people_profiles"
+
+    # All available fields with metadata
+    # Format: field_name -> {"type": str, "description": str, "fill_rate": float}
+    # fill_rate: approximate percentage of records in which the field is populated
+    FIELDS: Dict[str, Dict[str, Any]] = {
+        "id": {
+            "type": "text",
+            "description": "A unique identifier for the person's LinkedIn profile",
+            "fill_rate": 100.00,
+        },
+        "name": {
+            "type": "text",
+            "description": "Profile name",
+            "fill_rate": 97.54,
+        },
+        "first_name": {
+            "type": "text",
+            "description": "First name of the user",
+            "fill_rate": 95.10,
+        },
+        "last_name": {
+            "type": "text",
+            "description": "Last name of the user",
+            "fill_rate": 94.80,
+        },
+        "city": {
+            "type": "text",
+            "description": "City where the user is located",
+            "fill_rate": 96.30,
+        },
+        "country_code": {
+            "type": "text",
+            "description": "Country code of the user's location (e.g., US)",
+            "fill_rate": 97.11,
+        },
+        "location": {
+            "type": "text",
+            "description": "Free-text geographical location of the user",
+            "fill_rate": 61.93,
+        },
+        "position": {
+            "type": "text",
+            "description": "The current job title or position of the profile",
+            "fill_rate": 91.23,
+        },
+        "about": {
+            "type": "text",
+            "description": "A concise profile summary. May be truncated with '...'",
+            "fill_rate": 18.90,
+        },
+        "url": {
+            "type": "url",
+            "description": "URL that links directly to the LinkedIn profile",
+            "fill_rate": 100.00,
+        },
+        "input_url": {
+            "type": "url",
+            "description": "The URL that was entered when starting the scraping process",
+            "fill_rate": 100.00,
+        },
+        "linkedin_id": {
+            "type": "text",
+            "description": "LinkedIn profile identifier",
+            "fill_rate": 100.00,
+        },
+        "linkedin_num_id": {
+            "type": "text",
+            "description": "Numeric LinkedIn profile ID",
+            "fill_rate": 100.00,
+        },
+        "avatar": {
+            "type": "url",
+            "description": "URL that links to the profile picture of the LinkedIn user",
+            "fill_rate": 96.28,
+        },
+        "banner_image": {
+            "type": "url",
+            "description": "Banner image URL",
+            "fill_rate": 96.28,
+        },
+        "default_avatar": {
+            "type": "boolean",
+            "description": "Whether the avatar is the default placeholder picture",
+            "fill_rate": 95.73,
+        },
+        "followers": {
+            "type": "number",
+            "description": "Number of users/companies following the profile",
+            "fill_rate": 71.39,
+        },
+        "connections": {
+            "type": "number",
+            "description": "Number of connections the profile has",
+            "fill_rate": 70.33,
+        },
+        "recommendations_count": {
+            "type": "number",
+            "description": "Total number of recommendations received",
+            "fill_rate": 3.65,
+        },
+        "influencer": {
+            "type": "boolean",
+            "description": "Whether the profile is marked as an influencer",
+            "fill_rate": 46.06,
+        },
+        "memorialized_account": {
+            "type": "boolean",
+            "description": "Whether the account is memorialized",
+            "fill_rate": 99.44,
+        },
+        # Current company fields
+        "current_company_name": {
+            "type": "text",
+            "description": "The name of the latest/current company of the profile",
+            "fill_rate": 69.60,
+        },
+        "current_company_company_id": {
+            "type": "text",
+            "description": "The ID of the latest/current company of the profile",
+            "fill_rate": 38.94,
+        },
+        "current_company": {
+            "type": "object",
+            "description": "Current professional position info: company name, job title, company ID, industry",
+            "fill_rate": 100.00,
+            "nested_fields": 6,
+        },
+        # Experience & Education
+        "experience": {
+            "type": "array",
+            "description": "Professional history: job titles, dates, companies, locations",
+            
"fill_rate": 71.49, + "nested_fields": 16, + }, + "education": { + "type": "array", + "description": "Educational background: degree, field, start/end year", + "fill_rate": 41.97, + "nested_fields": 10, + }, + "educations_details": { + "type": "text", + "description": "Educational background as text", + "fill_rate": 42.08, + }, + # Activity & Posts + "posts": { + "type": "array", + "description": "User's last LinkedIn posts: title, date, URL", + "fill_rate": 1.27, + "nested_fields": 7, + }, + "activity": { + "type": "array", + "description": "Any activity the user has regarding posts", + "fill_rate": 32.95, + "nested_fields": 5, + }, + # Professional credentials + "certifications": { + "type": "array", + "description": "Licenses & Certifications", + "fill_rate": 8.35, + "nested_fields": 5, + }, + "courses": { + "type": "array", + "description": "Courses or educational programs undertaken", + "fill_rate": 2.55, + "nested_fields": 3, + }, + "languages": { + "type": "array", + "description": "User's language proficiencies", + "fill_rate": 9.19, + "nested_fields": 2, + }, + "publications": { + "type": "array", + "description": "Published works or presentations", + "fill_rate": 1.23, + "nested_fields": 4, + }, + "patents": { + "type": "array", + "description": "Patents filed or granted", + "fill_rate": 0.13, + "nested_fields": 4, + }, + "projects": { + "type": "array", + "description": "Professional or academic projects", + "fill_rate": 2.08, + "nested_fields": 4, + }, + "honors_and_awards": { + "type": "array", + "description": "Awards and recognitions received", + "fill_rate": 2.13, + "nested_fields": 4, + }, + # Social & Network + "recommendations": { + "type": "array", + "description": "Recommendations received from connections/colleagues", + "fill_rate": 3.61, + }, + "volunteer_experience": { + "type": "array", + "description": "Information related to volunteer work", + "fill_rate": 4.12, + "nested_fields": 8, + }, + "organizations": { + "type": "array", + "description": "Memberships in professional organizations", + "fill_rate": 1.78, + "nested_fields": 6, + }, + "people_also_viewed": { + "type": "array", + "description": "Profiles that viewers of this profile also viewed", + "fill_rate": 33.36, + "nested_fields": 4, + }, + "similar_profiles": { + "type": "array", + "description": "Profiles similar to the current one", + "fill_rate": 0.58, + "nested_fields": 4, + }, + "bio_links": { + "type": "array", + "description": "External links added to the bio", + "fill_rate": 2.96, + "nested_fields": 2, + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_high_fill_rate_fields(cls, min_rate: float = 50.0) -> list: + """Get fields with fill rate above threshold.""" + return [name for name, info in cls.FIELDS.items() if info.get("fill_rate", 0) >= min_rate] diff --git a/src/brightdata/datasets/models.py b/src/brightdata/datasets/models.py new file mode 100644 index 0000000..b2b7786 --- /dev/null +++ b/src/brightdata/datasets/models.py @@ -0,0 +1,73 @@ +""" +Data models for Datasets API responses. 
+""" + +from dataclasses import dataclass, field +from typing import Dict, Optional, Any, Literal + + +@dataclass +class DatasetInfo: + """Dataset info returned by list().""" + + id: str + name: str + size: int = 0 # record count + + +@dataclass +class DatasetField: + """Field metadata within a dataset.""" + + type: str # "text", "number", "url", "array", "object", "boolean" + active: bool = True + required: bool = False + description: Optional[str] = None + + +@dataclass +class DatasetMetadata: + """Dataset metadata returned by get_metadata().""" + + id: str + fields: Dict[str, DatasetField] = field(default_factory=dict) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "DatasetMetadata": + """Create from API response.""" + fields = {} + for name, field_data in data.get("fields", {}).items(): + if isinstance(field_data, dict): + fields[name] = DatasetField( + type=field_data.get("type", "text"), + active=field_data.get("active", True), + required=field_data.get("required", False), + description=field_data.get("description"), + ) + return cls(id=data.get("id", ""), fields=fields) + + +@dataclass +class SnapshotStatus: + """Snapshot status returned by get_status().""" + + id: str + status: Literal["scheduled", "building", "ready", "failed"] + dataset_id: Optional[str] = None + dataset_size: Optional[int] = None # records in snapshot + file_size: Optional[int] = None # bytes + cost: Optional[float] = None + error: Optional[str] = None + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "SnapshotStatus": + """Create from API response.""" + return cls( + id=data.get("id", data.get("snapshot_id", "")), + status=data.get("status", "scheduled"), + dataset_id=data.get("dataset_id"), + dataset_size=data.get("dataset_size"), + file_size=data.get("file_size"), + cost=data.get("cost"), + error=data.get("error", data.get("error_message")), + ) diff --git a/src/brightdata/datasets/nba/__init__.py b/src/brightdata/datasets/nba/__init__.py new file mode 100644 index 0000000..f1071ba --- /dev/null +++ b/src/brightdata/datasets/nba/__init__.py @@ -0,0 +1,5 @@ +"""NBA dataset.""" + +from .players_stats import NBAPlayersStats + +__all__ = ["NBAPlayersStats"] diff --git a/src/brightdata/datasets/nba/players_stats.py b/src/brightdata/datasets/nba/players_stats.py new file mode 100644 index 0000000..b4f5ca5 --- /dev/null +++ b/src/brightdata/datasets/nba/players_stats.py @@ -0,0 +1,136 @@ +""" +NBA Players Stats dataset. + +Dataset ID: gd_lrqirmftwxxatiorf + +See FIELDS dict for all filterable fields with descriptions. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class NBAPlayersStats(BaseDataset): + """ + NBA Players Stats dataset. + + Access NBA player statistics with filtering. + + Example: + >>> players = client.datasets.nba_players_stats + >>> metadata = await players.get_metadata() + >>> snapshot_id = await players.filter( + ... filter={"name": "player_points_per_game", "operator": ">", "value": 20}, + ... records_limit=100 + ... 
) + >>> data = await players.download(snapshot_id) + """ + + DATASET_ID = "gd_lrqirmftwxxatiorf" + NAME = "nba_players_stats" + + # All available fields with metadata + FIELDS: Dict[str, Dict[str, Any]] = { + # Player identification + "url": { + "type": "url", + "description": "ESPN player stats page URL", + }, + "player_name": { + "type": "text", + "description": "Player full name", + }, + "team": { + "type": "text", + "description": "Team abbreviation (e.g., LAL, GSW)", + }, + # Season info + "season_year": { + "type": "text", + "description": "Season year (e.g., 2024-25)", + }, + "season_type": { + "type": "text", + "description": "Season type (Regular, Playoffs)", + }, + # Games + "player_games_played": { + "type": "number", + "description": "Number of games played", + }, + "player_games_started": { + "type": "number", + "description": "Number of games started", + }, + "player_minutes_per_game": { + "type": "number", + "description": "Minutes played per game", + }, + # Scoring + "player_points_per_game": { + "type": "number", + "description": "Points scored per game", + }, + # Rebounds + "player_offensive_rebounds_per_game": { + "type": "number", + "description": "Offensive rebounds per game", + }, + "player_defensive_rebounds_per_game": { + "type": "number", + "description": "Defensive rebounds per game", + }, + "player_rebounds_per_game": { + "type": "number", + "description": "Total rebounds per game", + }, + # Assists & turnovers + "player_assists_per_game": { + "type": "number", + "description": "Assists per game", + }, + "player_turnovers_per_game": { + "type": "number", + "description": "Turnovers per game", + }, + "player_assist_to_turnover_ratio": { + "type": "number", + "description": "Assist to turnover ratio", + }, + # Defense + "player_steals_per_game": { + "type": "number", + "description": "Steals per game", + }, + "player_blocks_per_game": { + "type": "number", + "description": "Blocks per game", + }, + # Fouls + "player_fouls_per_game": { + "type": "number", + "description": "Personal fouls per game", + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_fields_by_type(cls, field_type: str) -> list: + """Get fields of a specific type (text, number, array, object, url, boolean).""" + return [name for name, info in cls.FIELDS.items() if info.get("type") == field_type] + + @classmethod + def get_per_game_stats(cls) -> list: + """Get all per-game statistics fields.""" + return [name for name in cls.FIELDS.keys() if "per_game" in name.lower()] diff --git a/src/brightdata/datasets/utils.py b/src/brightdata/datasets/utils.py new file mode 100644 index 0000000..c5b70a5 --- /dev/null +++ b/src/brightdata/datasets/utils.py @@ -0,0 +1,139 @@ +""" +Dataset utilities - helpers for exporting and processing dataset results. +""" + +import json +import csv +from pathlib import Path +from typing import List, Dict, Any, Union, Optional + + +def export_json( + data: List[Dict[str, Any]], + filepath: Union[str, Path], + indent: int = 2, +) -> Path: + """ + Export dataset results to JSON file. 
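+
+    A minimal usage sketch ("records" here stands in for a list of dicts,
+    e.g. the result of a dataset download):
+
+        >>> export_json(records, "products.json")  # doctest: +SKIP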
+ + Args: + data: List of records from download() + filepath: Output file path + indent: JSON indentation (default: 2) + + Returns: + Path to the created file + """ + filepath = Path(filepath) + with open(filepath, "w", encoding="utf-8") as f: + json.dump(data, f, indent=indent, default=str, ensure_ascii=False) + return filepath + + +def export_jsonl( + data: List[Dict[str, Any]], + filepath: Union[str, Path], +) -> Path: + """ + Export dataset results to JSONL (newline-delimited JSON) file. + + Args: + data: List of records from download() + filepath: Output file path + + Returns: + Path to the created file + """ + filepath = Path(filepath) + with open(filepath, "w", encoding="utf-8") as f: + for record in data: + f.write(json.dumps(record, default=str, ensure_ascii=False) + "\n") + return filepath + + +def export_csv( + data: List[Dict[str, Any]], + filepath: Union[str, Path], + fields: Optional[List[str]] = None, + flatten_nested: bool = True, +) -> Path: + """ + Export dataset results to CSV file. + + Args: + data: List of records from download() + filepath: Output file path + fields: Specific fields to export (default: all fields from first record) + flatten_nested: Convert nested objects/arrays to JSON strings (default: True) + + Returns: + Path to the created file + """ + if not data: + filepath = Path(filepath) + filepath.touch() + return filepath + + filepath = Path(filepath) + + # Determine fields + if fields is None: + fields = list(data[0].keys()) + + # Process data + processed_data = [] + for record in data: + row = {} + for field in fields: + value = record.get(field) + if flatten_nested and isinstance(value, (dict, list)): + value = json.dumps(value, default=str, ensure_ascii=False) + row[field] = value + processed_data.append(row) + + # Write CSV + with open(filepath, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fields) + writer.writeheader() + writer.writerows(processed_data) + + return filepath + + +def export( + data: List[Dict[str, Any]], + filepath: Union[str, Path], + **kwargs, +) -> Path: + """ + Export dataset results to file. Format is auto-detected from extension. + + Supported formats: + - .json: JSON format + - .jsonl, .ndjson: JSONL (newline-delimited JSON) + - .csv: CSV format + + Args: + data: List of records from download() + filepath: Output file path (extension determines format) + **kwargs: Additional arguments passed to format-specific exporter + + Returns: + Path to the created file + + Raises: + ValueError: If file extension is not supported + """ + filepath = Path(filepath) + ext = filepath.suffix.lower() + + if ext == ".json": + return export_json(data, filepath, **kwargs) + elif ext in (".jsonl", ".ndjson"): + return export_jsonl(data, filepath) + elif ext == ".csv": + return export_csv(data, filepath, **kwargs) + else: + raise ValueError( + f"Unsupported file extension: {ext}. 
" f"Supported: .json, .jsonl, .ndjson, .csv" + ) diff --git a/src/brightdata/datasets/world_population/__init__.py b/src/brightdata/datasets/world_population/__init__.py new file mode 100644 index 0000000..c27d252 --- /dev/null +++ b/src/brightdata/datasets/world_population/__init__.py @@ -0,0 +1,5 @@ +"""World Population dataset.""" + +from .countries import WorldPopulation + +__all__ = ["WorldPopulation"] diff --git a/src/brightdata/datasets/world_population/countries.py b/src/brightdata/datasets/world_population/countries.py new file mode 100644 index 0000000..44833b7 --- /dev/null +++ b/src/brightdata/datasets/world_population/countries.py @@ -0,0 +1,155 @@ +""" +World Population dataset. + +Dataset ID: gd_lrqeq7u3bil0pmelk + +See FIELDS dict for all filterable fields with descriptions. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class WorldPopulation(BaseDataset): + """ + World Population dataset. + + Access world population statistics by country with filtering. + + Example: + >>> population = client.datasets.world_population + >>> metadata = await population.get_metadata() + >>> snapshot_id = await population.filter( + ... filter={"name": "continent", "operator": "=", "value": "Europe"}, + ... records_limit=100 + ... ) + >>> data = await population.download(snapshot_id) + """ + + DATASET_ID = "gd_lrqeq7u3bil0pmelk" + NAME = "world_population" + + # All available fields with metadata + FIELDS: Dict[str, Dict[str, Any]] = { + # Country identification + "url": { + "type": "url", + "description": "Country page URL", + }, + "country": { + "type": "text", + "description": "Country name", + }, + "abbreviation": { + "type": "text", + "description": "Country code (e.g., USA, GBR)", + }, + "flag_image": { + "type": "url", + "description": "Country flag image URL", + }, + # Geographic info + "capital": { + "type": "text", + "description": "Capital city", + }, + "continent": { + "type": "text", + "description": "Continent name", + }, + "regions": { + "type": "array", + "description": "Geographic regions", + }, + "largest_cities": { + "type": "array", + "description": "Largest cities in the country", + }, + # Area + "country_area": { + "type": "number", + "description": "Total area (km²)", + }, + "country_land_area": { + "type": "number", + "description": "Land area (km²)", + }, + "country_density": { + "type": "number", + "description": "Population density per km²", + }, + # Population + "last_year_population": { + "type": "number", + "description": "Population from last year", + }, + "country_population_rank": { + "type": "number", + "description": "World population rank", + }, + "population_world_percentage": { + "type": "number", + "description": "Percentage of world population", + }, + "population_by_year": { + "type": "object", + "description": "Historical population data by year", + }, + # Population changes + "annual_population_growth": { + "type": "text", + "description": "Annual population growth rate and count", + }, + "population_change": { + "type": "number", + "description": "Total population change", + }, + "net_change_per_day": { + "type": "number", + "description": "Net population change per day", + }, + # Demographics + "births_per_day": { + "type": "number", + "description": "Average births per day", + }, + "deaths_per_day": { + "type": "number", + "description": "Average deaths per day", + }, + "emigrations_per_day": { + "type": "number", + "description": "Average 
emigrations per day", + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_fields_by_type(cls, field_type: str) -> list: + """Get fields of a specific type (text, number, array, object, url, boolean).""" + return [name for name, info in cls.FIELDS.items() if info.get("type") == field_type] + + @classmethod + def get_population_fields(cls) -> list: + """Get all population-related fields.""" + return [name for name in cls.FIELDS.keys() if "population" in name.lower()] + + @classmethod + def get_demographic_fields(cls) -> list: + """Get demographic fields (births, deaths, migrations).""" + return [ + name + for name in cls.FIELDS.keys() + if any(kw in name.lower() for kw in ["birth", "death", "emigration", "change"]) + ] diff --git a/src/brightdata/utils/ssl_helpers.py b/src/brightdata/utils/ssl_helpers.py index 482966f..a709651 100644 --- a/src/brightdata/utils/ssl_helpers.py +++ b/src/brightdata/utils/ssl_helpers.py @@ -92,7 +92,7 @@ def get_ssl_error_message(error: Exception) -> str: if is_macos(): fix_instructions = """ - + To fix this on macOS, try one of the following: 1. Install/upgrade certifi: @@ -112,7 +112,7 @@ def get_ssl_error_message(error: Exception) -> str: """ else: fix_instructions = """ - + To fix this, try: 1. Install/upgrade certifi: From 0f7548da610baf52810f6884c8263f614c451604 Mon Sep 17 00:00:00 2001 From: "user.mail" Date: Mon, 9 Feb 2026 17:03:52 +0300 Subject: [PATCH 2/2] Simplify API: remove .filter(), use callable datasets --- CHANGELOG.md | 2 +- MANIFEST.in | 1 - notebooks/datasets/amazon/amazon.ipynb | 8 ++++---- notebooks/datasets/crunchbase/crunchbase.ipynb | 14 +++++++------- notebooks/datasets/linkedin/linkedin.ipynb | 6 +++--- requirements-dev.txt | 1 - src/brightdata/datasets/base.py | 9 +++++---- 7 files changed, 20 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee5c5dd..662b366 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ Access Bright Data's pre-collected datasets with filtering and export capabiliti ```python async with BrightDataClient() as client: # Filter dataset records - snapshot_id = await client.datasets.amazon_products.filter( + snapshot_id = await client.datasets.amazon_products( filter={"name": "rating", "operator": ">=", "value": 4.5}, records_limit=100 ) diff --git a/MANIFEST.in b/MANIFEST.in index 37ee2c5..63958da 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,4 +4,3 @@ include CHANGELOG.md include pyproject.toml recursive-include src *.py recursive-include src *.typed - diff --git a/notebooks/datasets/amazon/amazon.ipynb b/notebooks/datasets/amazon/amazon.ipynb index 9ca9e47..f5680c7 100644 --- a/notebooks/datasets/amazon/amazon.ipynb +++ b/notebooks/datasets/amazon/amazon.ipynb @@ -217,7 +217,7 @@ "print(f\"Records limit: {LIMIT}\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.amazon_products.filter(\n", + " snapshot_id = await client.datasets.amazon_products(\n", " filter=FILTER,\n", " records_limit=LIMIT\n", " )\n", @@ -304,7 +304,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.amazon_products.filter(\n", + " snapshot_id = await client.datasets.amazon_products(\n", " filter=PRICE_FILTER,\n", " records_limit=5\n", " )\n", @@ -358,7 +358,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " 
snapshot_id = await client.datasets.amazon_products.filter(\n", + " snapshot_id = await client.datasets.amazon_products(\n", " filter=PRIME_FILTER,\n", " records_limit=5\n", " )\n", @@ -413,7 +413,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.amazon_products.filter(\n", + " snapshot_id = await client.datasets.amazon_products(\n", " filter=BRAND_FILTER,\n", " records_limit=5\n", " )\n", diff --git a/notebooks/datasets/crunchbase/crunchbase.ipynb b/notebooks/datasets/crunchbase/crunchbase.ipynb index 0babefd..c87ea2c 100644 --- a/notebooks/datasets/crunchbase/crunchbase.ipynb +++ b/notebooks/datasets/crunchbase/crunchbase.ipynb @@ -188,7 +188,7 @@ "print(f\"Records limit: {LIMIT}\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " snapshot_id = await client.datasets.crunchbase_companies(\n", " filter=FILTER,\n", " records_limit=LIMIT\n", " )\n", @@ -232,7 +232,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " snapshot_id = await client.datasets.crunchbase_companies(\n", " filter=EMPLOYEE_FILTER,\n", " records_limit=5\n", " )\n", @@ -276,7 +276,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " snapshot_id = await client.datasets.crunchbase_companies(\n", " filter=COUNTRY_FILTER,\n", " records_limit=5\n", " )\n", @@ -319,7 +319,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " snapshot_id = await client.datasets.crunchbase_companies(\n", " filter=FUNDED_FILTER,\n", " records_limit=5\n", " )\n", @@ -365,7 +365,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " snapshot_id = await client.datasets.crunchbase_companies(\n", " filter=IPO_FILTER,\n", " records_limit=5\n", " )\n", @@ -416,7 +416,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " snapshot_id = await client.datasets.crunchbase_companies(\n", " filter=FUNDED_FILTER,\n", " records_limit=5\n", " )\n", @@ -458,7 +458,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " snapshot_id = await client.datasets.crunchbase_companies(\n", " filter=IPO_FILTER,\n", " records_limit=5\n", " )\n", diff --git a/notebooks/datasets/linkedin/linkedin.ipynb b/notebooks/datasets/linkedin/linkedin.ipynb index d439317..ee90d6a 100644 --- a/notebooks/datasets/linkedin/linkedin.ipynb +++ b/notebooks/datasets/linkedin/linkedin.ipynb @@ -431,7 +431,7 @@ "print(f\"Records limit: {LIMIT}\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.linkedin_profiles.filter(\n", + " snapshot_id = await client.datasets.linkedin_profiles(\n", " filter=FILTER,\n", " records_limit=LIMIT\n", " )\n", @@ -586,7 +586,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Step 1: Create filter\nCOMBINED_FILTER = {\n \"operator\": \"and\",\n \"filters\": [\n {\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"US\"},\n {\"name\": \"followers\", \"operator\": \">\", \"value\": 5000}\n ]\n}\n\nprint(\"Filter: US-based 
profiles with 5000+ followers\")\nprint(f\"Records limit: 5\\n\")\n\nasync with client:\n snapshot_id = await client.datasets.linkedin_profiles.filter(\n filter=COMBINED_FILTER,\n records_limit=5\n )\n\nprint(f\"Snapshot created: {snapshot_id}\")" + "source": "# Step 1: Create filter\nCOMBINED_FILTER = {\n \"operator\": \"and\",\n \"filters\": [\n {\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"US\"},\n {\"name\": \"followers\", \"operator\": \">\", \"value\": 5000}\n ]\n}\n\nprint(\"Filter: US-based profiles with 5000+ followers\")\nprint(f\"Records limit: 5\\n\")\n\nasync with client:\n snapshot_id = await client.datasets.linkedin_profiles(\n filter=COMBINED_FILTER,\n records_limit=5\n )\n\nprint(f\"Snapshot created: {snapshot_id}\")" }, { "cell_type": "code", @@ -610,7 +610,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Step 1: Create filter\nCOMPANY_FILTER = {\n \"name\": \"company_size\",\n \"operator\": \"=\",\n \"value\": \"1001-5000 employees\"\n}\n\nprint(f\"Filter: {COMPANY_FILTER}\")\nprint(f\"Records limit: 5\\n\")\n\nasync with client:\n snapshot_id = await client.datasets.linkedin_companies.filter(\n filter=COMPANY_FILTER,\n records_limit=5\n )\n\nprint(f\"Snapshot created: {snapshot_id}\")" + "source": "# Step 1: Create filter\nCOMPANY_FILTER = {\n \"name\": \"company_size\",\n \"operator\": \"=\",\n \"value\": \"1001-5000 employees\"\n}\n\nprint(f\"Filter: {COMPANY_FILTER}\")\nprint(f\"Records limit: 5\\n\")\n\nasync with client:\n snapshot_id = await client.datasets.linkedin_companies(\n filter=COMPANY_FILTER,\n records_limit=5\n )\n\nprint(f\"Snapshot created: {snapshot_id}\")" }, { "cell_type": "code", diff --git a/requirements-dev.txt b/requirements-dev.txt index 5fc90a0..431ef73 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -7,4 +7,3 @@ black>=23.0.0 ruff>=0.1.0 mypy>=1.5.0 pre-commit>=3.4.0 - diff --git a/src/brightdata/datasets/base.py b/src/brightdata/datasets/base.py index 62e008f..0517ce6 100644 --- a/src/brightdata/datasets/base.py +++ b/src/brightdata/datasets/base.py @@ -22,7 +22,8 @@ class BaseDataset: """ Base class for all dataset types. - Provides common methods: get_metadata(), filter(), get_status(), download(). + Provides common methods: get_metadata(), get_status(), download(). + Call the dataset directly to filter: await dataset(filter=..., records_limit=...) Subclasses set their own DATASET_ID and can add dataset-specific helpers. """ @@ -60,7 +61,7 @@ async def get_metadata(self) -> DatasetMetadata: self._metadata = DatasetMetadata.from_dict(data) return self._metadata - async def filter( + async def __call__( self, filter: Dict[str, Any], records_limit: Optional[int] = None, @@ -106,7 +107,7 @@ async def get_status(self, snapshot_id: str) -> SnapshotStatus: Check snapshot status. Args: - snapshot_id: Snapshot ID from filter() + snapshot_id: Snapshot ID from calling the dataset Returns: SnapshotStatus with status field: "scheduled", "building", "ready", or "failed" @@ -130,7 +131,7 @@ async def download( Polls until snapshot is ready, then downloads and returns data. Args: - snapshot_id: Snapshot ID from filter() + snapshot_id: Snapshot ID from calling the dataset format: Response format (json, jsonl, csv) timeout: Max seconds to wait for snapshot to be ready poll_interval: Seconds between status checks