diff --git a/package.json b/package.json index 9cf0df6e8..5cc4fe628 100644 --- a/package.json +++ b/package.json @@ -92,8 +92,8 @@ "escape-string-regexp": "~5.0.0", "execa": "^9.5.2", "express": "~5.2.0", - "globby": "~15.0.0", "handlebars": "~4.7.8", + "ignore": "^5.3.2", "indent-string": "^5.0.0", "is-ci": "~4.1.0", "istextorbinary": "~9.5.0", @@ -107,6 +107,7 @@ "string-width": "^8.0.0", "strip-ansi": "^7.1.0", "tiged": "~2.12.7", + "tinyglobby": "^0.2.15", "which": "^6.0.0", "widest-line": "^6.0.0", "wrap-ansi": "^10.0.0" diff --git a/src/lib/utils.ts b/src/lib/utils.ts index 99bb775c8..e5425426e 100644 --- a/src/lib/utils.ts +++ b/src/lib/utils.ts @@ -1,9 +1,10 @@ +import { execSync } from 'node:child_process'; import { createWriteStream, existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync } from 'node:fs'; import { mkdir, readFile } from 'node:fs/promises'; import type { IncomingMessage } from 'node:http'; import { get } from 'node:https'; import { homedir } from 'node:os'; -import { dirname, join } from 'node:path'; +import { dirname, join, relative } from 'node:path'; import process from 'node:process'; import { finished } from 'node:stream/promises'; @@ -15,12 +16,13 @@ import { type ActorRun, ApifyClient, type ApifyClientOptions, type Build } from import archiver from 'archiver'; import { AxiosHeaders } from 'axios'; import escapeStringRegexp from 'escape-string-regexp'; -import { globby } from 'globby'; +import ignoreModule, { type Ignore } from 'ignore'; import { getEncoding } from 'istextorbinary'; import { Mime } from 'mime'; import otherMimes from 'mime/types/other.js'; import standardMimes from 'mime/types/standard.js'; import { gte, minVersion, satisfies } from 'semver'; +import { escapePath, glob } from 'tinyglobby'; import { ACTOR_ENV_VARS, @@ -134,8 +136,7 @@ const getTokenWithAuthFileFallback = (existingToken?: string) => { return existingToken; }; -// biome-ignore format: off -type CJSAxiosHeaders = import('axios', { with: { 
/**
 * Builds a path filter from `.gitignore` files for the case where `git` cannot be used
 * (binary missing, or `cwd` is not inside a work tree).
 *
 * Two sources of rules are collected:
 *  1. Every `.gitignore` at or below `cwd`; each one is applied only to paths under the
 *     directory that contains it.
 *  2. `.gitignore` files in ancestor directories of `cwd` (e.g. a monorepo root). The walk
 *     stops at the first ancestor that contains a `.git` entry, and is skipped entirely
 *     when `cwd` itself contains one, so rules from directories outside the enclosing
 *     repository are never applied.
 *
 * Limitation: every `.gitignore` is evaluated independently, so a negation (`!pattern`)
 * in one file cannot re-include a path that a different file ignores.
 *
 * @param cwd Directory the filtered paths are relative to.
 * @returns A function that takes paths relative to `cwd` (forward-slash separated) and
 *          returns only those not matched by any collected rule.
 */
const getGitignoreFallbackFilter = async (cwd: string): Promise<(paths: string[]) => string[]> => {
    const gitignoreFiles = await glob('**/.gitignore', {
        dot: true,
        cwd,
        ignore: ['.git/**'],
        expandDirectories: false,
    });

    // `ignore` is a CJS package; TypeScript sees its default import as the module
    // object rather than the callable factory, so we cast through unknown.
    const makeIg = ignoreModule as unknown as () => Ignore;

    // Rules found at or below cwd. The reads are independent, so run them concurrently;
    // filter order does not matter because a match in any filter excludes the path.
    const filters: { dir: string; ig: Ignore; ancestorPrefix?: string }[] = await Promise.all(
        gitignoreFiles.map(async (gitignoreFile) => {
            const gitignoreDir = dirname(gitignoreFile); // e.g. 'src' or '.'
            const content = await readFile(join(cwd, gitignoreFile), 'utf-8');
            return { dir: gitignoreDir === '.' ? '' : gitignoreDir, ig: makeIg().add(content) };
        }),
    );

    // Rules inherited from ancestor directories. When cwd itself holds a `.git` entry it
    // is already a repository root, so nothing above it may contribute rules.
    if (!existsSync(join(cwd, '.git'))) {
        let parentDir = dirname(cwd);
        // `dirname(x) === x` only at the filesystem root, which ends the walk.
        while (parentDir !== dirname(parentDir)) {
            // Check for the boundary BEFORE reading this directory's .gitignore, so the
            // boundary directory's own file is not applied.
            if (existsSync(join(parentDir, '.git'))) {
                break;
            }

            const parentGitignorePath = join(parentDir, '.gitignore');
            if (existsSync(parentGitignorePath)) {
                try {
                    const content = await readFile(parentGitignorePath, 'utf-8');
                    // Paths handed to the filter are relative to cwd; prefixing them with the
                    // path from the ancestor down to cwd lets the ancestor's patterns be
                    // matched from the directory they were written for.
                    const ancestorPrefix = relative(parentDir, cwd);
                    filters.push({ dir: '', ig: makeIg().add(content), ancestorPrefix });
                } catch {
                    // Best effort: an unreadable ancestor .gitignore is skipped.
                }
            }

            parentDir = dirname(parentDir);
        }
    }

    if (filters.length === 0) {
        return (paths) => paths;
    }

    return (paths) =>
        paths.filter((filePath) => {
            for (const { dir, ig, ancestorPrefix } of filters) {
                let relativePath: string | null;
                if (!dir) {
                    // Root-level or ancestor rules apply to every path.
                    relativePath = ancestorPrefix ? `${ancestorPrefix}/${filePath}` : filePath;
                } else if (filePath.startsWith(`${dir}/`)) {
                    // Nested rules only see paths below their own directory.
                    relativePath = filePath.slice(dir.length + 1);
                } else {
                    relativePath = null;
                }
                if (relativePath !== null && ig.ignores(relativePath)) {
                    return false;
                }
            }
            return true;
        });
};
/**
 * Lists the Actor's local files relative to `cwd`, omitting the `.git` folder, local
 * storage folders, `node_modules`, and everything excluded by gitignore rules.
 * All dot files (.file) and dot folders (.folder/) are included.
 *
 * Ignored paths are taken from `git ls-files` when possible. If that command fails
 * (git is not installed, or `cwd` is not inside a work tree), the `.gitignore` files
 * are parsed directly via `getGitignoreFallbackFilter` instead.
 *
 * @param cwd Directory to scan; defaults to the current working directory.
 * @returns Paths of the files to include, relative to `cwd`.
 */
export const getActorLocalFilePaths = async (cwd?: string): Promise<string[]> => {
    const resolvedCwd = cwd ?? process.cwd();

    const ignore = ['.git/**', 'apify_storage', 'node_modules', 'storage', 'crawlee_storage'];

    let fallbackFilter: ((paths: string[]) => string[]) | null = null;

    // Use git ls-files to get gitignored paths — this correctly handles ancestor .gitignore files,
    // nested .gitignore files, .git/info/exclude, and global gitignore config
    try {
        // `-z` makes git emit NUL-terminated, *unquoted* paths. Without it, paths with
        // non-ASCII or special characters are printed C-quoted (e.g. "caf\303\251/"), and the
        // resulting pattern would never match the real file.
        const gitOutput = execSync('git ls-files --others --ignored --exclude-standard --directory -z', {
            cwd: resolvedCwd,
            encoding: 'utf-8',
            stdio: ['ignore', 'pipe', 'ignore'],
        });

        // Append one by one instead of `push(...list)`: spreading a very large list into
        // call arguments can exceed the engine's argument limit.
        for (const ignoredPath of gitOutput.split('\0')) {
            if (ignoredPath) {
                // Escape glob metacharacters so the path is matched literally.
                ignore.push(escapePath(ignoredPath));
            }
        }
    } catch {
        // git is unavailable or directory is not a git repo — fall back to parsing .gitignore files
        fallbackFilter = await getGitignoreFallbackFilter(resolvedCwd);
    }

    const paths = await glob(['*', '**/**'], {
        ignore,
        dot: true,
        expandDirectories: false,
        cwd: resolvedCwd,
    });

    return fallbackFilter ? fallbackFilter(paths) : paths;
};
// Simulate an environment without git: `execSync` always throws, which sends
// `getActorLocalFilePaths` down its .gitignore-parsing fallback path.
// vi.mock is hoisted before imports, so utils.ts gets the mocked version.
vi.mock('node:child_process', async (importOriginal) => {
    // Type the original module explicitly: an untyped `importOriginal()` resolves to
    // `unknown`, which cannot be spread into the returned object.
    const original = await importOriginal<typeof import('node:child_process')>();
    return {
        ...original,
        execSync: () => {
            throw new Error('not a git repository');
        },
    };
});

const TEST_DIR = 'gitignore-fallback-test-dir';
const FOLDERS = ['src', 'src/utils'];
const FOLDERS_TO_IGNORE = ['dist', 'src/generated'];
const FILES = ['main.js', 'src/index.js', 'src/utils/helper.js'];
const FILES_IN_IGNORED_DIR = ['dist/bundle.js', 'src/generated/types.js'];
const FILES_TO_IGNORE = ['debug.log'];

// A single root .gitignore listing both folders and a file.
describe('Utils - gitignore fallback (no git)', () => {
    const { tmpPath, joinPath, beforeAllCalls, afterAllCalls } = useTempPath(TEST_DIR, {
        create: true,
        remove: true,
        cwd: false,
        cwdParent: false,
    });

    beforeAll(async () => {
        await beforeAllCalls();

        // NOTE: No git init here — execSync is mocked to throw, triggering the fallback path.

        FOLDERS.concat(FOLDERS_TO_IGNORE).forEach((folder) => {
            ensureFolderExistsSync(tmpPath, folder);
        });

        FILES.concat(FILES_TO_IGNORE, FILES_IN_IGNORED_DIR).forEach((file) =>
            writeFileSync(joinPath(file), 'content', { flag: 'w' }),
        );

        const toIgnore = FOLDERS_TO_IGNORE.concat(FILES_TO_IGNORE).join('\n');
        writeFileSync(joinPath('.gitignore'), toIgnore, { flag: 'w' });
    });

    afterAll(async () => {
        await afterAllCalls();
    });

    it('should exclude files listed in .gitignore when git is unavailable', async () => {
        const paths = await getActorLocalFilePaths(tmpPath);

        FILES.forEach((file) => expect(paths).toContain(file));
        FILES_IN_IGNORED_DIR.concat(FILES_TO_IGNORE).forEach((file) => expect(paths).not.toContain(file));
    });
});

const NESTED_TEST_DIR = 'gitignore-nested-test-dir';

// A .gitignore that lives in a subdirectory must only affect paths below that directory.
describe('Utils - nested .gitignore scoping (no git)', () => {
    const { tmpPath, joinPath, beforeAllCalls, afterAllCalls } = useTempPath(NESTED_TEST_DIR, {
        create: true,
        remove: true,
        cwd: false,
        cwdParent: false,
    });

    beforeAll(async () => {
        await beforeAllCalls();

        // Create directory structure
        ensureFolderExistsSync(tmpPath, 'src');
        ensureFolderExistsSync(tmpPath, 'src/internal');

        // Create files: one public, one that should be scoped-ignored by src/.gitignore
        writeFileSync(joinPath('src/public.js'), 'content', { flag: 'w' });
        writeFileSync(joinPath('src/internal/secret.js'), 'content', { flag: 'w' });

        // Only a nested .gitignore — the root has no entry for src/internal
        writeFileSync(joinPath('src/.gitignore'), 'internal/', { flag: 'w' });
    });

    afterAll(async () => {
        await afterAllCalls();
    });

    it('should exclude files matched by a nested .gitignore scoped to its own directory', async () => {
        const paths = await getActorLocalFilePaths(tmpPath);

        // src/public.js should be present
        expect(paths).toContain('src/public.js');

        // src/internal/secret.js should be excluded by src/.gitignore's `internal/` rule
        expect(paths).not.toContain('src/internal/secret.js');
    });
});

const PARENT_TEST_DIR = 'gitignore-parent-test-dir';

describe('Utils - parent .gitignore applied to subproject (no git)', () => {
    // tmpPath is the "project root" that holds the parent .gitignore.
    // The actual cwd passed to getActorLocalFilePaths is tmpPath/subproject/.
    const { tmpPath, beforeAllCalls, afterAllCalls } = useTempPath(PARENT_TEST_DIR, {
        create: true,
        remove: true,
        cwd: false,
        cwdParent: false,
    });

    let subprojectPath: string;

    beforeAll(async () => {
        await beforeAllCalls();

        subprojectPath = join(tmpPath, 'subproject');

        // Parent .gitignore — rules that should apply to everything inside subproject/.
        // No fake .git is created: this relies on the suite running inside a git checkout,
        // where the ancestor walk stops at the directory holding `.git` before reading
        // that directory's own .gitignore.
        writeFileSync(join(tmpPath, '.gitignore'), '*.secret\nbuild/\n', { flag: 'w' });

        // Subproject directory structure
        mkdirSync(subprojectPath, { recursive: true });
        ensureFolderExistsSync(subprojectPath, 'src');
        ensureFolderExistsSync(subprojectPath, 'build');

        // Files that should be kept
        writeFileSync(join(subprojectPath, 'main.js'), 'content', { flag: 'w' });
        writeFileSync(join(subprojectPath, 'src', 'utils.js'), 'content', { flag: 'w' });

        // Files/dirs that should be excluded by parent .gitignore
        writeFileSync(join(subprojectPath, 'config.secret'), 'content', { flag: 'w' });
        writeFileSync(join(subprojectPath, 'src', 'db.secret'), 'content', { flag: 'w' });
        writeFileSync(join(subprojectPath, 'build', 'output.js'), 'content', { flag: 'w' });
    });

    afterAll(async () => {
        await afterAllCalls();
    });

    it('should exclude files matched by *.secret pattern in parent .gitignore', async () => {
        const paths = await getActorLocalFilePaths(subprojectPath);

        expect(paths).toContain('main.js');
        expect(paths).toContain('src/utils.js');

        expect(paths).not.toContain('config.secret');
        expect(paths).not.toContain('src/db.secret');
    });

    it('should exclude directory matched by build/ pattern in parent .gitignore', async () => {
        const paths = await getActorLocalFilePaths(subprojectPath);

        expect(paths).not.toContain('build/output.js');
    });
});
}); FILES.concat(FILES_TO_IGNORE, FILES_IN_IGNORED_DIR).forEach((file) => - writeFileSync(joinPath(file), Math.random().toString(36).substring(7), { flag: 'w' }), + writeFileSync(joinPath(file), Math.random().toString(36).substring(7), { + flag: 'w', + }), ); const toIgnore = FOLDERS_TO_IGNORE.concat(FILES_TO_IGNORE).join('\n'); diff --git a/yarn.lock b/yarn.lock index a1f80c36f..f7bd7eaa6 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2298,8 +2298,8 @@ __metadata: eslint-config-prettier: "npm:^10.1.2" execa: "npm:^9.5.2" express: "npm:~5.2.0" - globby: "npm:~15.0.0" handlebars: "npm:~4.7.8" + ignore: "npm:^5.3.2" indent-string: "npm:^5.0.0" is-ci: "npm:~4.1.0" istextorbinary: "npm:~9.5.0" @@ -2316,6 +2316,7 @@ __metadata: string-width: "npm:^8.0.0" strip-ansi: "npm:^7.1.0" tiged: "npm:~2.12.7" + tinyglobby: "npm:^0.2.15" tsup: "npm:^8.5.0" tsx: "npm:^4.16.5" typescript: "npm:^5.8.3" @@ -4377,7 +4378,7 @@ __metadata: languageName: node linkType: hard -"fast-glob@npm:^3.2.2, fast-glob@npm:^3.3.3": +"fast-glob@npm:^3.2.2": version: 3.3.3 resolution: "fast-glob@npm:3.3.3" dependencies: @@ -4944,20 +4945,6 @@ __metadata: languageName: node linkType: hard -"globby@npm:~15.0.0": - version: 15.0.0 - resolution: "globby@npm:15.0.0" - dependencies: - "@sindresorhus/merge-streams": "npm:^4.0.0" - fast-glob: "npm:^3.3.3" - ignore: "npm:^7.0.5" - path-type: "npm:^6.0.0" - slash: "npm:^5.1.0" - unicorn-magic: "npm:^0.3.0" - checksum: 10c0/e4107be0579bcdd9642b8dff86aeafeaf62b2b9dd116669ab6e02e0e0c07ada0d972c2db182dee7588b460fe8c8919ddcc6b1cc4db405ca3a2adc9d35fa6eb21 - languageName: node - linkType: hard - "globrex@npm:^0.1.2": version: 0.1.2 resolution: "globrex@npm:0.1.2" @@ -5257,7 +5244,7 @@ __metadata: languageName: node linkType: hard -"ignore@npm:^5.2.0": +"ignore@npm:^5.2.0, ignore@npm:^5.3.2": version: 5.3.2 resolution: "ignore@npm:5.3.2" checksum: 
10c0/f9f652c957983634ded1e7f02da3b559a0d4cc210fca3792cb67f1b153623c9c42efdc1c4121af171e295444459fc4a9201101fb041b1104a3c000bccb188337 @@ -6993,13 +6980,6 @@ __metadata: languageName: node linkType: hard -"path-type@npm:^6.0.0": - version: 6.0.0 - resolution: "path-type@npm:6.0.0" - checksum: 10c0/55baa8b1187d6dc683d5a9cfcc866168d6adff58e5db91126795376d818eee46391e00b2a4d53e44d844c7524a7d96aa68cc68f4f3e500d3d069a39e6535481c - languageName: node - linkType: hard - "pathe@npm:^2.0.1, pathe@npm:^2.0.3": version: 2.0.3 resolution: "pathe@npm:2.0.3" @@ -7941,13 +7921,6 @@ __metadata: languageName: node linkType: hard -"slash@npm:^5.1.0": - version: 5.1.0 - resolution: "slash@npm:5.1.0" - checksum: 10c0/eb48b815caf0bdc390d0519d41b9e0556a14380f6799c72ba35caf03544d501d18befdeeef074bc9c052acf69654bc9e0d79d7f1de0866284137a40805299eb3 - languageName: node - linkType: hard - "slice-ansi@npm:^7.1.0": version: 7.1.2 resolution: "slice-ansi@npm:7.1.2"