Skip to content

Commit 39656da

Browse files
committed
Fix HTTP 451 DMCA and 403 TOS handling regression (josegonzalez#487)
The DMCA handling added in PR josegonzalez#454 had a bug: make_request_with_retry() raises HTTPError before retrieve_data() could check the status code via getcode(), making the case 451 handler dead code. This also affected HTTP 403 TOS violations (e.g. jumoog/MagiskOnWSA). Fix by catching HTTPError in retrieve_data() and converting 451 and blocked 403 responses (identified by "block" key in response body) to RepositoryUnavailableError. Non-block 403s (permissions, scopes) still propagate as HTTPError. Also handle RepositoryUnavailableError in retrieve_repositories() for the --repository case. Rewrote tests to mock urlopen (not make_request_with_retry) to exercise the real code path that was previously untested. Closes josegonzalez#487
1 parent 8c1a134 commit 39656da

File tree

3 files changed

+228
-78
lines changed

3 files changed

+228
-78
lines changed

github_backup/github_backup.py

Lines changed: 75 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,11 @@
3939

4040

4141
class RepositoryUnavailableError(Exception):
42-
"""Raised when a repository is unavailable due to legal reasons (e.g., DMCA takedown)."""
42+
"""Raised when a repository is unavailable due to legal reasons (e.g., DMCA takedown, TOS violation)."""
4343

44-
def __init__(self, message, dmca_url=None):
44+
def __init__(self, message, legal_url=None):
4545
super().__init__(message)
46-
self.dmca_url = dmca_url
46+
self.legal_url = legal_url
4747

4848

4949
# Setup SSL context with fallback chain
@@ -647,6 +647,14 @@ def _extract_next_page_url(link_header):
647647
return None
648648

649649
def fetch_all() -> Generator[dict, None, None]:
650+
def _extract_legal_url(response_body_bytes):
651+
"""Extract DMCA/legal notice URL from GitHub API error response body."""
652+
try:
653+
data = json.loads(response_body_bytes.decode("utf-8"))
654+
return data.get("block", {}).get("html_url")
655+
except Exception:
656+
return None
657+
650658
next_url = None
651659

652660
while True:
@@ -661,47 +669,64 @@ def fetch_all() -> Generator[dict, None, None]:
661669
as_app=args.as_app,
662670
fine=args.token_fine is not None,
663671
)
664-
http_response = make_request_with_retry(request, auth, args.max_retries)
665-
666-
match http_response.getcode():
667-
case 200:
668-
# Success - Parse JSON response
669-
try:
670-
response = json.loads(http_response.read().decode("utf-8"))
671-
break # Exit retry loop and handle the data returned
672-
except (
673-
IncompleteRead,
674-
json.decoder.JSONDecodeError,
675-
TimeoutError,
676-
) as e:
677-
logger.warning(f"{type(e).__name__} reading response")
678-
if attempt < args.max_retries:
679-
delay = calculate_retry_delay(attempt, {})
680-
logger.warning(
681-
f"Retrying read in {delay:.1f}s (attempt {attempt + 1}/{args.max_retries + 1})"
682-
)
683-
time.sleep(delay)
684-
continue # Next retry attempt
685-
686-
case 451:
687-
# DMCA takedown - extract URL if available, then raise
688-
dmca_url = None
689-
try:
690-
response_data = json.loads(
691-
http_response.read().decode("utf-8")
692-
)
693-
dmca_url = response_data.get("block", {}).get("html_url")
694-
except Exception:
695-
pass
672+
try:
673+
http_response = make_request_with_retry(request, auth, args.max_retries)
674+
except HTTPError as exc:
675+
if exc.code == 451:
676+
legal_url = _extract_legal_url(exc.read())
696677
raise RepositoryUnavailableError(
697-
"Repository unavailable due to legal reasons (HTTP 451)",
698-
dmca_url=dmca_url,
678+
f"Repository unavailable due to legal reasons (HTTP {exc.code})",
679+
legal_url=legal_url,
699680
)
681+
elif exc.code == 403:
682+
# Rate-limit 403s (x-ratelimit-remaining=0) are retried
683+
# by make_request_with_retry — re-raise if exhausted.
684+
if int(exc.headers.get("x-ratelimit-remaining", 1)) < 1:
685+
raise
686+
# Only convert to RepositoryUnavailableError if GitHub
687+
# indicates a TOS/DMCA block (response contains "block"
688+
# key). Other 403s (permissions, scopes) should propagate.
689+
body = exc.read()
690+
try:
691+
data = json.loads(body.decode("utf-8"))
692+
except Exception:
693+
data = {}
694+
if "block" in data:
695+
raise RepositoryUnavailableError(
696+
"Repository access blocked (HTTP 403)",
697+
legal_url=data.get("block", {}).get("html_url"),
698+
)
699+
raise
700+
else:
701+
raise
702+
703+
# urlopen raises HTTPError for non-2xx, so only success gets here.
704+
# Guard against unexpected status codes from proxies, future Python
705+
# changes, or other edge cases we haven't considered.
706+
status = http_response.getcode()
707+
if status != 200:
708+
raise Exception(
709+
f"Unexpected HTTP {status} from {next_url or template} "
710+
f"(expected non-2xx to raise HTTPError)"
711+
)
700712

701-
case _:
702-
raise Exception(
703-
f"API request returned HTTP {http_response.getcode()}: {http_response.reason}"
713+
# Parse JSON response
714+
try:
715+
response = json.loads(http_response.read().decode("utf-8"))
716+
break # Exit retry loop and handle the data returned
717+
except (
718+
IncompleteRead,
719+
json.decoder.JSONDecodeError,
720+
TimeoutError,
721+
) as e:
722+
logger.warning(f"{type(e).__name__} reading response")
723+
if attempt < args.max_retries:
724+
delay = calculate_retry_delay(attempt, {})
725+
logger.warning(
726+
f"Retrying read in {delay:.1f}s (attempt {attempt + 1}/{args.max_retries + 1})"
704727
)
728+
time.sleep(delay)
729+
continue # Next retry attempt
705730
else:
706731
logger.error(
707732
f"Failed to read response after {args.max_retries + 1} attempts for {next_url or template}"
@@ -1614,7 +1639,13 @@ def retrieve_repositories(args, authenticated_user):
16141639
paginated = False
16151640
template = "https://{0}/repos/{1}".format(get_github_api_host(args), repo_path)
16161641

1617-
repos = retrieve_data(args, template, paginated=paginated)
1642+
try:
1643+
repos = retrieve_data(args, template, paginated=paginated)
1644+
except RepositoryUnavailableError as e:
1645+
logger.warning(f"Repository is unavailable: {e}")
1646+
if e.legal_url:
1647+
logger.warning(f"Legal notice: {e.legal_url}")
1648+
return []
16181649

16191650
if args.all_starred:
16201651
starred_template = "https://{0}/users/{1}/starred".format(
@@ -1833,10 +1864,10 @@ def backup_repositories(args, output_directory, repositories):
18331864
)
18341865
except RepositoryUnavailableError as e:
18351866
logger.warning(
1836-
f"Repository {repository['full_name']} is unavailable (HTTP 451)"
1867+
f"Repository {repository['full_name']} is unavailable: {e}"
18371868
)
1838-
if e.dmca_url:
1839-
logger.warning(f"DMCA notice: {e.dmca_url}")
1869+
if e.legal_url:
1870+
logger.warning(f"Legal notice: {e.legal_url}")
18401871
logger.info(f"Skipping remaining resources for {repository['full_name']}")
18411872
continue
18421873

tests/test_http_451.py

Lines changed: 131 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,35 @@
1-
"""Tests for HTTP 451 (DMCA takedown) handling."""
1+
"""Tests for HTTP 451 (DMCA takedown) and HTTP 403 (TOS) handling."""
22

3+
import io
34
import json
4-
from unittest.mock import Mock, patch
5+
from unittest.mock import patch
6+
from urllib.error import HTTPError
57

68
import pytest
79

810
from github_backup import github_backup
911

1012

13+
def _make_http_error(code, body_bytes, msg="Error", headers=None):
14+
"""Create an HTTPError with a readable body (like a real urllib response)."""
15+
if headers is None:
16+
headers = {"x-ratelimit-remaining": "5000"}
17+
return HTTPError(
18+
url="https://api.github.com/repos/test/repo",
19+
code=code,
20+
msg=msg,
21+
hdrs=headers,
22+
fp=io.BytesIO(body_bytes),
23+
)
24+
25+
1126
class TestHTTP451Exception:
1227
"""Test suite for HTTP 451 DMCA takedown exception handling."""
1328

1429
def test_repository_unavailable_error_raised(self, create_args):
1530
"""HTTP 451 should raise RepositoryUnavailableError with DMCA URL."""
1631
args = create_args()
1732

18-
mock_response = Mock()
19-
mock_response.getcode.return_value = 451
20-
2133
dmca_data = {
2234
"message": "Repository access blocked",
2335
"block": {
@@ -26,66 +38,151 @@ def test_repository_unavailable_error_raised(self, create_args):
2638
"html_url": "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md",
2739
},
2840
}
29-
mock_response.read.return_value = json.dumps(dmca_data).encode("utf-8")
30-
mock_response.headers = {"x-ratelimit-remaining": "5000"}
31-
mock_response.reason = "Unavailable For Legal Reasons"
32-
33-
with patch(
34-
"github_backup.github_backup.make_request_with_retry",
35-
return_value=mock_response,
36-
):
41+
body = json.dumps(dmca_data).encode("utf-8")
42+
43+
def mock_urlopen(*a, **kw):
44+
raise _make_http_error(451, body, msg="Unavailable For Legal Reasons")
45+
46+
with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
3747
with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
3848
github_backup.retrieve_data(
3949
args, "https://api.github.com/repos/test/dmca/issues"
4050
)
4151

4252
assert (
43-
exc_info.value.dmca_url
53+
exc_info.value.legal_url
4454
== "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md"
4555
)
4656
assert "451" in str(exc_info.value)
4757

48-
def test_repository_unavailable_error_without_dmca_url(self, create_args):
58+
def test_repository_unavailable_error_without_legal_url(self, create_args):
4959
"""HTTP 451 without DMCA details should still raise exception."""
5060
args = create_args()
5161

52-
mock_response = Mock()
53-
mock_response.getcode.return_value = 451
54-
mock_response.read.return_value = b'{"message": "Blocked"}'
55-
mock_response.headers = {"x-ratelimit-remaining": "5000"}
56-
mock_response.reason = "Unavailable For Legal Reasons"
62+
def mock_urlopen(*a, **kw):
63+
raise _make_http_error(451, b'{"message": "Blocked"}')
5764

58-
with patch(
59-
"github_backup.github_backup.make_request_with_retry",
60-
return_value=mock_response,
61-
):
65+
with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
6266
with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
6367
github_backup.retrieve_data(
6468
args, "https://api.github.com/repos/test/dmca/issues"
6569
)
6670

67-
assert exc_info.value.dmca_url is None
71+
assert exc_info.value.legal_url is None
6872
assert "451" in str(exc_info.value)
6973

7074
def test_repository_unavailable_error_with_malformed_json(self, create_args):
7175
"""HTTP 451 with malformed JSON should still raise exception."""
7276
args = create_args()
7377

74-
mock_response = Mock()
75-
mock_response.getcode.return_value = 451
76-
mock_response.read.return_value = b"invalid json {"
77-
mock_response.headers = {"x-ratelimit-remaining": "5000"}
78-
mock_response.reason = "Unavailable For Legal Reasons"
78+
def mock_urlopen(*a, **kw):
79+
raise _make_http_error(451, b"invalid json {")
7980

80-
with patch(
81-
"github_backup.github_backup.make_request_with_retry",
82-
return_value=mock_response,
83-
):
81+
with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
8482
with pytest.raises(github_backup.RepositoryUnavailableError):
8583
github_backup.retrieve_data(
8684
args, "https://api.github.com/repos/test/dmca/issues"
8785
)
8886

8987

88+
class TestHTTP403TOS:
89+
"""Test suite for HTTP 403 TOS violation handling."""
90+
91+
def test_403_tos_raises_repository_unavailable(self, create_args):
92+
"""HTTP 403 (non-rate-limit) should raise RepositoryUnavailableError."""
93+
args = create_args()
94+
95+
tos_data = {
96+
"message": "Repository access blocked",
97+
"block": {
98+
"reason": "tos",
99+
"html_url": "https://github.com/contact/tos-violation",
100+
},
101+
}
102+
body = json.dumps(tos_data).encode("utf-8")
103+
104+
def mock_urlopen(*a, **kw):
105+
raise _make_http_error(
106+
403, body, msg="Forbidden",
107+
headers={"x-ratelimit-remaining": "5000"},
108+
)
109+
110+
with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
111+
with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
112+
github_backup.retrieve_data(
113+
args, "https://api.github.com/repos/test/blocked/issues"
114+
)
115+
116+
assert exc_info.value.legal_url == "https://github.com/contact/tos-violation"
117+
assert "403" in str(exc_info.value)
118+
119+
def test_403_permission_denied_not_converted(self, create_args):
120+
"""HTTP 403 without 'block' in body should propagate as HTTPError, not RepositoryUnavailableError."""
121+
args = create_args()
122+
123+
body = json.dumps({"message": "Must have admin rights to Repository."}).encode("utf-8")
124+
125+
def mock_urlopen(*a, **kw):
126+
raise _make_http_error(
127+
403, body, msg="Forbidden",
128+
headers={"x-ratelimit-remaining": "5000"},
129+
)
130+
131+
with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
132+
with pytest.raises(HTTPError) as exc_info:
133+
github_backup.retrieve_data(
134+
args, "https://api.github.com/repos/test/private/issues"
135+
)
136+
137+
assert exc_info.value.code == 403
138+
139+
def test_403_rate_limit_not_converted(self, create_args):
140+
"""HTTP 403 with rate limit exhausted should NOT become RepositoryUnavailableError."""
141+
args = create_args()
142+
143+
call_count = 0
144+
145+
def mock_urlopen(*a, **kw):
146+
nonlocal call_count
147+
call_count += 1
148+
raise _make_http_error(
149+
403, b'{"message": "rate limit"}', msg="Forbidden",
150+
headers={"x-ratelimit-remaining": "0"},
151+
)
152+
153+
with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
154+
with patch(
155+
"github_backup.github_backup.calculate_retry_delay", return_value=0
156+
):
157+
with pytest.raises(HTTPError) as exc_info:
158+
github_backup.retrieve_data(
159+
args, "https://api.github.com/repos/test/ratelimit/issues"
160+
)
161+
162+
assert exc_info.value.code == 403
163+
# Should have retried (not raised immediately as RepositoryUnavailableError)
164+
assert call_count > 1
165+
166+
167+
class TestRetrieveRepositoriesUnavailable:
168+
"""Test that retrieve_repositories handles RepositoryUnavailableError gracefully."""
169+
170+
def test_unavailable_repo_returns_empty_list(self, create_args):
171+
"""retrieve_repositories should return [] when the repo is unavailable."""
172+
args = create_args(repository="blocked-repo")
173+
174+
def mock_urlopen(*a, **kw):
175+
raise _make_http_error(
176+
451,
177+
json.dumps({"message": "Blocked", "block": {"html_url": "https://example.com/dmca"}}).encode("utf-8"),
178+
msg="Unavailable For Legal Reasons",
179+
)
180+
181+
with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
182+
repos = github_backup.retrieve_repositories(args, {"login": None})
183+
184+
assert repos == []
185+
186+
90187
if __name__ == "__main__":
91188
pytest.main([__file__, "-v"])

0 commit comments

Comments
 (0)