3939
4040
class RepositoryUnavailableError(Exception):
    """Raised when a repository is unavailable due to legal reasons.

    Examples of such reasons are a DMCA takedown or a TOS violation.

    The URL of the associated legal notice, when the raiser supplied one,
    is stored on the ``legal_url`` attribute (``None`` otherwise).
    """

    def __init__(self, message, legal_url=None, dmca_url=None):
        """Initialise the error.

        Args:
            message: Human-readable description, forwarded to ``Exception``.
            legal_url: Optional URL of the legal notice for the block.
            dmca_url: Deprecated spelling of ``legal_url``; only consulted
                when ``legal_url`` is not given, so existing callers that
                still pass ``dmca_url=...`` keep working.
        """
        super().__init__(message)
        # Prefer the current keyword; fall back to the legacy one.
        self.legal_url = legal_url if legal_url is not None else dmca_url

    @property
    def dmca_url(self):
        """Deprecated read-only alias for ``legal_url``."""
        return self.legal_url
4747
4848
4949# Setup SSL context with fallback chain
@@ -647,6 +647,14 @@ def _extract_next_page_url(link_header):
647647 return None
648648
649649 def fetch_all () -> Generator [dict , None , None ]:
650+ def _extract_legal_url (response_body_bytes ):
651+ """Extract DMCA/legal notice URL from GitHub API error response body."""
652+ try :
653+ data = json .loads (response_body_bytes .decode ("utf-8" ))
654+ return data .get ("block" , {}).get ("html_url" )
655+ except Exception :
656+ return None
657+
650658 next_url = None
651659
652660 while True :
@@ -661,47 +669,66 @@ def fetch_all() -> Generator[dict, None, None]:
661669 as_app = args .as_app ,
662670 fine = args .token_fine is not None ,
663671 )
664- http_response = make_request_with_retry (request , auth , args .max_retries )
665-
666- match http_response .getcode ():
667- case 200 :
668- # Success - Parse JSON response
669- try :
670- response = json .loads (http_response .read ().decode ("utf-8" ))
671- break # Exit retry loop and handle the data returned
672- except (
673- IncompleteRead ,
674- json .decoder .JSONDecodeError ,
675- TimeoutError ,
676- ) as e :
677- logger .warning (f"{ type (e ).__name__ } reading response" )
678- if attempt < args .max_retries :
679- delay = calculate_retry_delay (attempt , {})
680- logger .warning (
681- f"Retrying read in { delay :.1f} s (attempt { attempt + 1 } /{ args .max_retries + 1 } )"
682- )
683- time .sleep (delay )
684- continue # Next retry attempt
685-
686- case 451 :
687- # DMCA takedown - extract URL if available, then raise
688- dmca_url = None
689- try :
690- response_data = json .loads (
691- http_response .read ().decode ("utf-8" )
692- )
693- dmca_url = response_data .get ("block" , {}).get ("html_url" )
694- except Exception :
695- pass
672+ try :
673+ http_response = make_request_with_retry (
674+ request , auth , args .max_retries
675+ )
676+ except HTTPError as exc :
677+ if exc .code == 451 :
678+ legal_url = _extract_legal_url (exc .read ())
696679 raise RepositoryUnavailableError (
697- "Repository unavailable due to legal reasons (HTTP 451 )" ,
698- dmca_url = dmca_url ,
680+ f "Repository unavailable due to legal reasons (HTTP { exc . code } )" ,
681+ legal_url = legal_url ,
699682 )
683+ elif exc .code == 403 :
684+ # Rate-limit 403s (x-ratelimit-remaining=0) are retried
685+ # by make_request_with_retry — re-raise if exhausted.
686+ if int (exc .headers .get ("x-ratelimit-remaining" , 1 )) < 1 :
687+ raise
688+ # Only convert to RepositoryUnavailableError if GitHub
689+ # indicates a TOS/DMCA block (response contains "block"
690+ # key). Other 403s (permissions, scopes) should propagate.
691+ body = exc .read ()
692+ try :
693+ data = json .loads (body .decode ("utf-8" ))
694+ except Exception :
695+ data = {}
696+ if "block" in data :
697+ raise RepositoryUnavailableError (
698+ "Repository access blocked (HTTP 403)" ,
699+ legal_url = data .get ("block" , {}).get ("html_url" ),
700+ )
701+ raise
702+ else :
703+ raise
704+
705+ # urlopen raises HTTPError for non-2xx, so only success gets here.
706+ # Guard against unexpected status codes from proxies, future Python
707+ # changes, or other edge cases we haven't considered.
708+ status = http_response .getcode ()
709+ if status != 200 :
710+ raise Exception (
711+ f"Unexpected HTTP { status } from { next_url or template } "
712+ f"(expected non-2xx to raise HTTPError)"
713+ )
700714
701- case _:
702- raise Exception (
703- f"API request returned HTTP { http_response .getcode ()} : { http_response .reason } "
715+ # Parse JSON response
716+ try :
717+ response = json .loads (http_response .read ().decode ("utf-8" ))
718+ break # Exit retry loop and handle the data returned
719+ except (
720+ IncompleteRead ,
721+ json .decoder .JSONDecodeError ,
722+ TimeoutError ,
723+ ) as e :
724+ logger .warning (f"{ type (e ).__name__ } reading response" )
725+ if attempt < args .max_retries :
726+ delay = calculate_retry_delay (attempt , {})
727+ logger .warning (
728+ f"Retrying read in { delay :.1f} s (attempt { attempt + 1 } /{ args .max_retries + 1 } )"
704729 )
730+ time .sleep (delay )
731+ continue # Next retry attempt
705732 else :
706733 logger .error (
707734 f"Failed to read response after { args .max_retries + 1 } attempts for { next_url or template } "
@@ -1614,7 +1641,13 @@ def retrieve_repositories(args, authenticated_user):
16141641 paginated = False
16151642 template = "https://{0}/repos/{1}" .format (get_github_api_host (args ), repo_path )
16161643
1617- repos = retrieve_data (args , template , paginated = paginated )
1644+ try :
1645+ repos = retrieve_data (args , template , paginated = paginated )
1646+ except RepositoryUnavailableError as e :
1647+ logger .warning (f"Repository is unavailable: { e } " )
1648+ if e .legal_url :
1649+ logger .warning (f"Legal notice: { e .legal_url } " )
1650+ return []
16181651
16191652 if args .all_starred :
16201653 starred_template = "https://{0}/users/{1}/starred" .format (
@@ -1832,11 +1865,9 @@ def backup_repositories(args, output_directory, repositories):
18321865 include_assets = args .include_assets or args .include_everything ,
18331866 )
18341867 except RepositoryUnavailableError as e :
1835- logger .warning (
1836- f"Repository { repository ['full_name' ]} is unavailable (HTTP 451)"
1837- )
1838- if e .dmca_url :
1839- logger .warning (f"DMCA notice: { e .dmca_url } " )
1868+ logger .warning (f"Repository { repository ['full_name' ]} is unavailable: { e } " )
1869+ if e .legal_url :
1870+ logger .warning (f"Legal notice: { e .legal_url } " )
18401871 logger .info (f"Skipping remaining resources for { repository ['full_name' ]} " )
18411872 continue
18421873
0 commit comments