Skip to content

Commit 655886f

Browse files
authored
Merge pull request josegonzalez#488 from Iamrodos/fix/487-dmca-regression
Fix HTTP 451 DMCA and 403 TOS handling regression (josegonzalez#487)
2 parents 8c1a134 + 0162f7e commit 655886f

File tree

3 files changed

+245
-80
lines changed

3 files changed

+245
-80
lines changed

github_backup/github_backup.py

Lines changed: 77 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -39,11 +39,11 @@
3939

4040

4141
class RepositoryUnavailableError(Exception):
    """Raised when a repository is unavailable due to legal reasons.

    Examples are a DMCA takedown or a TOS violation.

    Args:
        message: Human-readable description of why the repository is
            unavailable.
        legal_url: Optional URL of the legal notice for the block.
        dmca_url: Deprecated alias for ``legal_url``, accepted so callers
            written against the previous keyword keep working. It is used
            only when ``legal_url`` is not given.
    """

    def __init__(self, message, legal_url=None, dmca_url=None):
        super().__init__(message)
        # Prefer the new keyword; fall back to the legacy one.
        self.legal_url = legal_url if legal_url is not None else dmca_url

    @property
    def dmca_url(self):
        """Deprecated read-only alias for ``legal_url``."""
        return self.legal_url
4747

4848

4949
# Setup SSL context with fallback chain
@@ -647,6 +647,14 @@ def _extract_next_page_url(link_header):
647647
return None
648648

649649
def fetch_all() -> Generator[dict, None, None]:
650+
def _extract_legal_url(response_body_bytes):
651+
"""Extract DMCA/legal notice URL from GitHub API error response body."""
652+
try:
653+
data = json.loads(response_body_bytes.decode("utf-8"))
654+
return data.get("block", {}).get("html_url")
655+
except Exception:
656+
return None
657+
650658
next_url = None
651659

652660
while True:
@@ -661,47 +669,66 @@ def fetch_all() -> Generator[dict, None, None]:
661669
as_app=args.as_app,
662670
fine=args.token_fine is not None,
663671
)
664-
http_response = make_request_with_retry(request, auth, args.max_retries)
665-
666-
match http_response.getcode():
667-
case 200:
668-
# Success - Parse JSON response
669-
try:
670-
response = json.loads(http_response.read().decode("utf-8"))
671-
break # Exit retry loop and handle the data returned
672-
except (
673-
IncompleteRead,
674-
json.decoder.JSONDecodeError,
675-
TimeoutError,
676-
) as e:
677-
logger.warning(f"{type(e).__name__} reading response")
678-
if attempt < args.max_retries:
679-
delay = calculate_retry_delay(attempt, {})
680-
logger.warning(
681-
f"Retrying read in {delay:.1f}s (attempt {attempt + 1}/{args.max_retries + 1})"
682-
)
683-
time.sleep(delay)
684-
continue # Next retry attempt
685-
686-
case 451:
687-
# DMCA takedown - extract URL if available, then raise
688-
dmca_url = None
689-
try:
690-
response_data = json.loads(
691-
http_response.read().decode("utf-8")
692-
)
693-
dmca_url = response_data.get("block", {}).get("html_url")
694-
except Exception:
695-
pass
672+
try:
673+
http_response = make_request_with_retry(
674+
request, auth, args.max_retries
675+
)
676+
except HTTPError as exc:
677+
if exc.code == 451:
678+
legal_url = _extract_legal_url(exc.read())
696679
raise RepositoryUnavailableError(
697-
"Repository unavailable due to legal reasons (HTTP 451)",
698-
dmca_url=dmca_url,
680+
f"Repository unavailable due to legal reasons (HTTP {exc.code})",
681+
legal_url=legal_url,
699682
)
683+
elif exc.code == 403:
684+
# Rate-limit 403s (x-ratelimit-remaining=0) are retried
685+
# by make_request_with_retry — re-raise if exhausted.
686+
if int(exc.headers.get("x-ratelimit-remaining", 1)) < 1:
687+
raise
688+
# Only convert to RepositoryUnavailableError if GitHub
689+
# indicates a TOS/DMCA block (response contains "block"
690+
# key). Other 403s (permissions, scopes) should propagate.
691+
body = exc.read()
692+
try:
693+
data = json.loads(body.decode("utf-8"))
694+
except Exception:
695+
data = {}
696+
if "block" in data:
697+
raise RepositoryUnavailableError(
698+
"Repository access blocked (HTTP 403)",
699+
legal_url=data.get("block", {}).get("html_url"),
700+
)
701+
raise
702+
else:
703+
raise
704+
705+
# urlopen raises HTTPError for non-2xx, so only success gets here.
706+
# Guard against unexpected status codes from proxies, future Python
707+
# changes, or other edge cases we haven't considered.
708+
status = http_response.getcode()
709+
if status != 200:
710+
raise Exception(
711+
f"Unexpected HTTP {status} from {next_url or template} "
712+
f"(expected non-2xx to raise HTTPError)"
713+
)
700714

701-
case _:
702-
raise Exception(
703-
f"API request returned HTTP {http_response.getcode()}: {http_response.reason}"
715+
# Parse JSON response
716+
try:
717+
response = json.loads(http_response.read().decode("utf-8"))
718+
break # Exit retry loop and handle the data returned
719+
except (
720+
IncompleteRead,
721+
json.decoder.JSONDecodeError,
722+
TimeoutError,
723+
) as e:
724+
logger.warning(f"{type(e).__name__} reading response")
725+
if attempt < args.max_retries:
726+
delay = calculate_retry_delay(attempt, {})
727+
logger.warning(
728+
f"Retrying read in {delay:.1f}s (attempt {attempt + 1}/{args.max_retries + 1})"
704729
)
730+
time.sleep(delay)
731+
continue # Next retry attempt
705732
else:
706733
logger.error(
707734
f"Failed to read response after {args.max_retries + 1} attempts for {next_url or template}"
@@ -1614,7 +1641,13 @@ def retrieve_repositories(args, authenticated_user):
16141641
paginated = False
16151642
template = "https://{0}/repos/{1}".format(get_github_api_host(args), repo_path)
16161643

1617-
repos = retrieve_data(args, template, paginated=paginated)
1644+
try:
1645+
repos = retrieve_data(args, template, paginated=paginated)
1646+
except RepositoryUnavailableError as e:
1647+
logger.warning(f"Repository is unavailable: {e}")
1648+
if e.legal_url:
1649+
logger.warning(f"Legal notice: {e.legal_url}")
1650+
return []
16181651

16191652
if args.all_starred:
16201653
starred_template = "https://{0}/users/{1}/starred".format(
@@ -1832,11 +1865,9 @@ def backup_repositories(args, output_directory, repositories):
18321865
include_assets=args.include_assets or args.include_everything,
18331866
)
18341867
except RepositoryUnavailableError as e:
1835-
logger.warning(
1836-
f"Repository {repository['full_name']} is unavailable (HTTP 451)"
1837-
)
1838-
if e.dmca_url:
1839-
logger.warning(f"DMCA notice: {e.dmca_url}")
1868+
logger.warning(f"Repository {repository['full_name']} is unavailable: {e}")
1869+
if e.legal_url:
1870+
logger.warning(f"Legal notice: {e.legal_url}")
18401871
logger.info(f"Skipping remaining resources for {repository['full_name']}")
18411872
continue
18421873

0 commit comments

Comments
 (0)