diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index 6e7dc02a66f..7e6cd69b4aa 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -54,16 +54,6 @@ jobs: - uses: actions/checkout@v6 with: ref: ${{ github.event.pull_request.head.sha }} - - uses: dorny/paths-filter@v3 - id: duckdb-buildfile-changed - with: - filters: | - changed: - - 'vortex-duckdb/build.rs' - - name: Install curl dev headers for duckdb source build - if: steps.duckdb-buildfile-changed.outputs.changed == 'true' - run: | - sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev - uses: ./.github/actions/setup-rust with: repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index b583aed6c65..9743a554eae 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -56,16 +56,6 @@ jobs: with: sccache: s3 - uses: actions/checkout@v6 - - uses: dorny/paths-filter@v3 - id: duckdb-buildfile-changed - with: - filters: | - changed: - - 'vortex-duckdb/build.rs' - - name: Install curl dev headers for duckdb source build - if: steps.duckdb-buildfile-changed.outputs.changed == 'true' - run: | - sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev - uses: ./.github/actions/setup-rust with: repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 028be5c080f..a8214920da2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -188,29 +188,6 @@ jobs: - uses: ./.github/actions/setup-rust with: repo-token: ${{ secrets.GITHUB_TOKEN }} - # DuckDB build from duckdb/, if building httpfs statically, fails unless - # you have cURL dev headers present. Ideally we'd see these loaded during - # DuckDB setup itself, but this is a tricky change, see - # https://github.com/duckdb/duckdb/issues/21073 - # - # This does NOT check for DUCKDB_VERSION changes, and its use in CI - # is generally discouraged. Please change build.rs. - # - # Ideally I'd use a awalsh128/cache-apt-pkgs-action@v1.5.2, - # as apt-get takes around 22s, but this specific action is broken, see - # https://github.com/awalsh128/cache-apt-pkgs-action/issues/181 - # - # TODO(myrrc): this should all go away once we have AMI in CI. - - uses: dorny/paths-filter@v3 - id: duckdb-buildfile-changed - with: - filters: | - changed: - - 'vortex-duckdb/build.rs' - - name: Install curl dev headers for duckdb source build - if: steps.duckdb-buildfile-changed.outputs.changed == 'true' - run: | - sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev - name: Docs run: | RUSTDOCFLAGS="-D warnings" cargo doc --no-deps @@ -310,16 +287,6 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Install nightly for fmt run: rustup toolchain install $NIGHTLY_TOOLCHAIN --component rustfmt - - uses: dorny/paths-filter@v3 - id: duckdb-buildfile-changed - with: - filters: | - changed: - - 'vortex-duckdb/build.rs' - - name: Install curl dev headers for duckdb source build - if: steps.duckdb-buildfile-changed.outputs.changed == 'true' - run: | - sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev - name: Rust Lint - Format run: cargo +$NIGHTLY_TOOLCHAIN fmt --all --check - name: Rustc check @@ -444,16 +411,6 @@ jobs: uses: taiki-e/install-action@v2 with: tool: nextest - - uses: dorny/paths-filter@v3 - id: duckdb-buildfile-changed - with: - filters: | - changed: - - 'vortex-duckdb/build.rs' - - name: Install curl dev headers for duckdb source build - if: steps.duckdb-buildfile-changed.outputs.changed == 'true' - run: | - sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev - name: Rust Tests if: ${{ matrix.suite == 'tests' }} run: | @@ -520,7 +477,8 @@ jobs: components: "rust-src, rustfmt, clippy, llvm-tools-preview" - name: Install build dependencies run: | - sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build cmake libcurl4-openssl-dev + sudo apt-get update + sudo apt-get install -y ninja-build cmake - name: Install nextest uses: taiki-e/install-action@v2 with: @@ -654,16 +612,6 @@ jobs: uses: taiki-e/install-action@v2 with: tool: nextest - - uses: dorny/paths-filter@v3 - id: duckdb-buildfile-changed - with: - filters: | - changed: - - 'vortex-duckdb/build.rs' - - name: Install curl dev headers for duckdb source build - if: matrix.os != 'windows-x64' && steps.duckdb-buildfile-changed.outputs.changed == 'true' - run: | - sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev - name: Rust Tests (Windows) if: matrix.os == 'windows-x64' run: | @@ -822,20 +770,11 @@ jobs: uses: ./.github/actions/setup-rust with: repo-token: ${{ secrets.GITHUB_TOKEN }} - - uses: dorny/paths-filter@v3 - id: duckdb-buildfile-changed - with: - filters: | - changed: - - 'vortex-duckdb/build.rs' - - name: Install curl dev headers for duckdb source build - if: steps.duckdb-buildfile-changed.outputs.changed == 'true' - run: | - sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev - name: Run sqllogictest tests run: | cargo test -p vortex-sqllogictest --test sqllogictests + wasm-integration: name: "wasm-integration" runs-on: ubuntu-latest diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index fc784f0961f..f98e7ef87d4 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -115,18 +115,6 @@ jobs: - uses: actions/checkout@v6 if: inputs.mode != 'pr' - - - uses: dorny/paths-filter@v3 - id: duckdb-buildfile-changed - with: - filters: | - changed: - - 'vortex-duckdb/build.rs' - - name: Install curl dev headers for duckdb source build - if: steps.duckdb-buildfile-changed.outputs.changed == 'true' - run: | - sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev - - uses: ./.github/actions/setup-rust with: repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/benchmarks/duckdb-bench/src/lib.rs b/benchmarks/duckdb-bench/src/lib.rs index a92a84e9e93..5b7c4abb6d6 100644 --- a/benchmarks/duckdb-bench/src/lib.rs +++ b/benchmarks/duckdb-bench/src/lib.rs @@ -87,6 +87,10 @@ impl DuckClient { let connection = db.connect()?; vortex_duckdb::initialize(&db)?; + // Install and load httpfs so DuckDB can access remote files (S3, GCS, HTTP). + connection.query("INSTALL httpfs;")?; + connection.query("LOAD httpfs;")?; + // Enable Parquet metadata cache for all benchmark runs. // // `parquet_metadata_cache` is an extension-specific option that's diff --git a/vortex-bench/src/tpcds/duckdb.rs b/vortex-bench/src/tpcds/duckdb.rs index 3890e223339..ebdbf9fc220 100644 --- a/vortex-bench/src/tpcds/duckdb.rs +++ b/vortex-bench/src/tpcds/duckdb.rs @@ -29,6 +29,8 @@ pub fn generate_tpcds(base_dir: PathBuf, scale_factor: String) -> Result = Lazy::new(|| { // Override the DuckDB version via environment variable in case of an extension build. // `DUCKDB_VERSION` is set by the extension build in the `duckdb-vortex` repo. @@ -256,15 +252,7 @@ fn extract_duckdb_source(source_dir: &Path) -> Result Result> { - let build_type = match debug { - true => "debug", - false => "release", - }; +fn build_duckdb(duckdb_source_dir: &Path) -> Result> { // Check for ninja if Command::new("ninja").arg("--version").output().is_err() { return Err( @@ -274,12 +262,10 @@ fn build_duckdb( let inner_dir_name = DUCKDB_VERSION.archive_inner_dir_name(); let duckdb_repo_dir = duckdb_source_dir.join(&inner_dir_name); - let build_dir = duckdb_repo_dir.join("build").join(build_type); + let build_dir = duckdb_repo_dir.join("build").join("debug"); + // Check if already built let lib_dir = build_dir.join("src"); - let lib_dir_str = lib_dir.display(); - println!("cargo:info=Checking if DuckDB is already built in {lib_dir_str}",); - let already_built = lib_dir.join("libduckdb.dylib").exists() || lib_dir.join("libduckdb.so").exists() || lib_dir @@ -300,26 +286,12 @@ fn build_duckdb( ("1", "0") }; - let mut envs = vec![ - ("GEN", "ninja"), - ("DISABLE_SANITIZER", asan_option), - ("THREADSAN", tsan_option), - ("BUILD_SHELL", "false"), - ("BUILD_UNITTESTS", "false"), - ("ENABLE_UNITTEST_CPP_TESTS", "false"), - ]; - - // If we're building from a commit (likely a pre-release), we need to - // build extensions statically. Otherwise DuckDB tries to load them - // from an http endpoint with version 0.0.1 (all non-tagged builds) - // which doesn't exists. httpfs also requires CURL dev headers - if matches!(version, DuckDBVersion::Commit(_)) { - envs.push(("BUILD_EXTENSIONS", "httpfs;parquet;tpch;tpcds;jemalloc")); - }; - let output = Command::new("make") .current_dir(&duckdb_repo_dir) - .envs(envs) + .env("GEN", "ninja") + .env("DISABLE_SANITIZER", asan_option) + .env("THREADSAN", tsan_option) + .arg("debug") .output()?; if !output.status.success() { @@ -403,21 +375,15 @@ fn main() { drop(fs::remove_dir_all(&duckdb_symlink)); std::os::unix::fs::symlink(&extracted_source_path, &duckdb_symlink).unwrap(); + // Determine whether to build from source or use prebuilt libraries let use_debug_build = env::var("VX_DUCKDB_DEBUG").is_ok_and(|v| matches!(v.as_str(), "1" | "true")); - println!("cargo:info=DuckDB debug build: {use_debug_build}"); let library_path = if use_debug_build || !DUCKDB_VERSION.is_release() { // Build from source for: // - Commit hashes (no prebuilt available) // - When VX_DUCKDB_DEBUG=1 (user wants debug build) - match build_duckdb(&extracted_source_path, &DUCKDB_VERSION, use_debug_build) { - Ok(path) => path, - Err(err) => { - println!("cargo:error={err}"); - panic!("duckdb build failed"); - } - } + build_duckdb(&extracted_source_path).unwrap() } else { // Download prebuilt libraries for release versions let archive_path = download_duckdb_lib_archive().unwrap(); diff --git a/vortex-duckdb/src/e2e_test/vortex_scan_test.rs b/vortex-duckdb/src/e2e_test/vortex_scan_test.rs index f2191fcbb6f..210921c0d5e 100644 --- a/vortex-duckdb/src/e2e_test/vortex_scan_test.rs +++ b/vortex-duckdb/src/e2e_test/vortex_scan_test.rs @@ -361,10 +361,8 @@ fn test_vortex_scan_over_http() { let listener = TcpListener::bind("127.0.0.1:0").unwrap(); let addr = listener.local_addr().unwrap(); - // Spawn 10 threads because DuckDB does HEAD and GET requests with retries, - // thus 2 threads, one for each implementation, aren't enough std::thread::spawn(move || { - for _ in 0..10 { + for _ in 0..2 { if let Ok((mut stream, _)) = listener.accept() { let response = format!( "HTTP/1.1 200 OK\r\nContent-Length: {}\r\n\r\n", @@ -378,30 +376,24 @@ fn test_vortex_scan_over_http() { let conn = database_connection(); conn.query("SET vortex_filesystem = 'duckdb';").unwrap(); - for httpfs_impl in ["httplib", "curl"] { - println!("Testing httpfs client implementation: {httpfs_impl}"); - conn.query(&format!( - "SET httpfs_client_implementation = '{httpfs_impl}';" - )) - .unwrap(); + conn.query("INSTALL httpfs;").unwrap(); + conn.query("LOAD httpfs;").unwrap(); - let url = format!( - "http://{}/{}", - addr, - file.path().file_name().unwrap().to_string_lossy() - ); - println!("url={url}, file={}", file.path().display()); + let url = format!( + "http://{}/{}", + addr, + file.path().file_name().unwrap().to_string_lossy() + ); - let result = conn - .query(&format!("SELECT COUNT(*) FROM read_vortex('{url}')")) - .unwrap(); - let chunk = result.into_iter().next().unwrap(); - let count = chunk - .get_vector(0) - .as_slice_with_len::(chunk.len().as_())[0]; + let result = conn + .query(&format!("SELECT COUNT(*) FROM read_vortex('{url}')")) + .unwrap(); + let chunk = result.into_iter().next().unwrap(); + let count = chunk + .get_vector(0) + .as_slice_with_len::(chunk.len().as_())[0]; - assert_eq!(count, 3); - } + assert_eq!(count, 3); } #[test] diff --git a/vortex-duckdb/src/filesystem.rs b/vortex-duckdb/src/filesystem.rs index 8240d490d4f..e4bb279a567 100644 --- a/vortex-duckdb/src/filesystem.rs +++ b/vortex-duckdb/src/filesystem.rs @@ -124,7 +124,7 @@ impl FileSystem for DuckDbFileSystem { fn list(&self, prefix: &str) -> BoxStream<'_, VortexResult> { let mut directory_url = self.base_url.clone(); if !prefix.is_empty() { - directory_url.set_path(prefix); + directory_url.set_path(&format!("{}/{}", directory_url.path(), prefix)); } // DuckDB's ListFiles expects bare paths for local files, but full URLs @@ -159,7 +159,7 @@ impl FileSystem for DuckDbFileSystem { async fn open_read(&self, path: &str) -> VortexResult> { let mut url = self.base_url.clone(); - url.set_path(path); + url.set_path(&format!("{}/{}", url.path(), path)); let reader = unsafe { DuckDbFsReader::open_url(self.ctx.as_ptr(), &url)? }; Ok(Arc::new(reader)) }