-
-
Notifications
You must be signed in to change notification settings - Fork 34.1k
gh-144156: Fix email header folding concatenating encoded words #144692
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
b5925e0
1a7824e
fb23a6e
050491c
ed6f197
91b4b3b
d5f5001
15e6618
902ec59
a1cfe3c
45022be
ccf399d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -80,7 +80,8 @@ | |||||||||||||
| # Useful constants and functions | ||||||||||||||
| # | ||||||||||||||
|
|
||||||||||||||
| WSP = set(' \t') | ||||||||||||||
| _WSP = ' \t' | ||||||||||||||
| WSP = set(_WSP) | ||||||||||||||
| CFWS_LEADER = WSP | set('(') | ||||||||||||||
| SPECIALS = set(r'()<>@,:;.\"[]') | ||||||||||||||
| ATOM_ENDS = SPECIALS | WSP | ||||||||||||||
|
|
@@ -2835,6 +2836,13 @@ def _steal_trailing_WSP_if_exists(lines): | |||||||||||||
| lines.pop() | ||||||||||||||
| return wsp | ||||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
| def _last_word_is_sill_ew(_last_word_is_ew, added_str): | ||||||||||||||
| # If the last word is an encoded word, and the added string is all WSP, | ||||||||||||||
| # then (and only then) is the last word is still an encoded word. | ||||||||||||||
| return _last_word_is_ew and not bool(added_str.strip(_WSP)) | ||||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
| def _refold_parse_tree(parse_tree, *, policy): | ||||||||||||||
| """Return string of contents of parse_tree folded according to RFC rules. | ||||||||||||||
|
|
||||||||||||||
|
|
@@ -2843,11 +2851,9 @@ def _refold_parse_tree(parse_tree, *, policy): | |||||||||||||
| maxlen = policy.max_line_length or sys.maxsize | ||||||||||||||
| encoding = 'utf-8' if policy.utf8 else 'us-ascii' | ||||||||||||||
| lines = [''] # Folded lines to be output | ||||||||||||||
| leading_whitespace = '' # When we have whitespace between two encoded | ||||||||||||||
| # words, we may need to encode the whitespace | ||||||||||||||
| # at the beginning of the second word. | ||||||||||||||
| last_ew = None # Points to the last encoded character if there's an ew on | ||||||||||||||
| # the line | ||||||||||||||
| last_word_is_ew = False | ||||||||||||||
| last_ew = None # if there is an encoded word in the last line of lines, | ||||||||||||||
| # points to the encoded word's first character | ||||||||||||||
| last_charset = None | ||||||||||||||
| wrap_as_ew_blocked = 0 | ||||||||||||||
| want_encoding = False # This is set to True if we need to encode this part | ||||||||||||||
|
|
@@ -2882,6 +2888,7 @@ def _refold_parse_tree(parse_tree, *, policy): | |||||||||||||
| if part.token_type == 'mime-parameters': | ||||||||||||||
| # Mime parameter folding (using RFC2231) is extra special. | ||||||||||||||
| _fold_mime_parameters(part, lines, maxlen, encoding) | ||||||||||||||
| last_word_is_ew = False | ||||||||||||||
| continue | ||||||||||||||
|
|
||||||||||||||
| if want_encoding and not wrap_as_ew_blocked: | ||||||||||||||
|
|
@@ -2898,6 +2905,7 @@ def _refold_parse_tree(parse_tree, *, policy): | |||||||||||||
| # XXX what if encoded_part has no leading FWS? | ||||||||||||||
| lines.append(newline) | ||||||||||||||
| lines[-1] += encoded_part | ||||||||||||||
| last_word_is_ew = False | ||||||||||||||
| continue | ||||||||||||||
| # Either this is not a major syntactic break, so we don't | ||||||||||||||
| # want it on a line by itself even if it fits, or it | ||||||||||||||
|
|
@@ -2916,11 +2924,16 @@ def _refold_parse_tree(parse_tree, *, policy): | |||||||||||||
| (last_charset == 'unknown-8bit' or | ||||||||||||||
| last_charset == 'utf-8' and charset != 'us-ascii')): | ||||||||||||||
| last_ew = None | ||||||||||||||
| last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew, | ||||||||||||||
| part.ew_combine_allowed, charset, leading_whitespace) | ||||||||||||||
| # This whitespace has been added to the lines in _fold_as_ew() | ||||||||||||||
| # so clear it now. | ||||||||||||||
| leading_whitespace = '' | ||||||||||||||
| last_ew = _fold_as_ew( | ||||||||||||||
| tstr, | ||||||||||||||
| lines, | ||||||||||||||
| maxlen, | ||||||||||||||
| last_ew, | ||||||||||||||
| part.ew_combine_allowed, | ||||||||||||||
| charset, | ||||||||||||||
| last_word_is_ew, | ||||||||||||||
| ) | ||||||||||||||
| last_word_is_ew = True | ||||||||||||||
| last_charset = charset | ||||||||||||||
| want_encoding = False | ||||||||||||||
| continue | ||||||||||||||
|
|
@@ -2933,28 +2946,20 @@ def _refold_parse_tree(parse_tree, *, policy): | |||||||||||||
|
|
||||||||||||||
| if len(tstr) <= maxlen - len(lines[-1]): | ||||||||||||||
| lines[-1] += tstr | ||||||||||||||
| last_word_is_ew = _last_word_is_sill_ew(last_word_is_ew, tstr) | ||||||||||||||
| continue | ||||||||||||||
|
|
||||||||||||||
| # This part is too long to fit. The RFC wants us to break at | ||||||||||||||
| # "major syntactic breaks", so unless we don't consider this | ||||||||||||||
| # to be one, check if it will fit on the next line by itself. | ||||||||||||||
| leading_whitespace = '' | ||||||||||||||
| if (part.syntactic_break and | ||||||||||||||
| len(tstr) + 1 <= maxlen): | ||||||||||||||
| newline = _steal_trailing_WSP_if_exists(lines) | ||||||||||||||
| if newline or part.startswith_fws(): | ||||||||||||||
| # We're going to fold the data onto a new line here. Due to | ||||||||||||||
| # the way encoded strings handle continuation lines, we need to | ||||||||||||||
| # be prepared to encode any whitespace if the next line turns | ||||||||||||||
| # out to start with an encoded word. | ||||||||||||||
| lines.append(newline + tstr) | ||||||||||||||
|
|
||||||||||||||
| whitespace_accumulator = [] | ||||||||||||||
| for char in lines[-1]: | ||||||||||||||
| if char not in WSP: | ||||||||||||||
| break | ||||||||||||||
| whitespace_accumulator.append(char) | ||||||||||||||
| leading_whitespace = ''.join(whitespace_accumulator) | ||||||||||||||
| last_word_is_ew = _last_word_is_sill_ew( | ||||||||||||||
| last_word_is_ew, lines[-1] | ||||||||||||||
| ) | ||||||||||||||
| last_ew = None | ||||||||||||||
| continue | ||||||||||||||
| if not hasattr(part, 'encode'): | ||||||||||||||
|
|
@@ -2994,10 +2999,11 @@ def _refold_parse_tree(parse_tree, *, policy): | |||||||||||||
| else: | ||||||||||||||
| # We can't fold it onto the next line either... | ||||||||||||||
| lines[-1] += tstr | ||||||||||||||
| last_word_is_ew = _last_word_is_sill_ew(last_word_is_ew, tstr) | ||||||||||||||
|
|
||||||||||||||
| return policy.linesep.join(lines) + policy.linesep | ||||||||||||||
|
|
||||||||||||||
| def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace): | ||||||||||||||
| def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew): | ||||||||||||||
| """Fold string to_encode into lines as encoded word, combining if allowed. | ||||||||||||||
| Return the new value for last_ew, or None if ew_combine_allowed is False. | ||||||||||||||
|
|
||||||||||||||
|
|
@@ -3012,14 +3018,24 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, | |||||||||||||
| to_encode = str( | ||||||||||||||
| get_unstructured(lines[-1][last_ew:] + to_encode)) | ||||||||||||||
| lines[-1] = lines[-1][:last_ew] | ||||||||||||||
| elif to_encode[0] in WSP: | ||||||||||||||
| elif to_encode[0] in WSP and not last_word_is_ew: | ||||||||||||||
| # We're joining this to non-encoded text, so don't encode | ||||||||||||||
| # the leading blank. | ||||||||||||||
| leading_wsp = to_encode[0] | ||||||||||||||
| to_encode = to_encode[1:] | ||||||||||||||
| if (len(lines[-1]) == maxlen): | ||||||||||||||
| lines.append(_steal_trailing_WSP_if_exists(lines)) | ||||||||||||||
| lines[-1] += leading_wsp | ||||||||||||||
| elif last_word_is_ew: | ||||||||||||||
| # If we are following up an encoded word with another encoded word, | ||||||||||||||
| # any white space between the two will be ignored when decoded. | ||||||||||||||
| # Therefore, we encode all to-be-displayed whitespace in the second | ||||||||||||||
| # encoded word. | ||||||||||||||
| len_without_wsp = len(lines[-1].rstrip(_WSP)) | ||||||||||||||
| leading_whitespace = lines[-1][len_without_wsp:] | ||||||||||||||
| lines[-1] = (lines[-1][:len_without_wsp] | ||||||||||||||
| + (' ' if leading_whitespace else '')) | ||||||||||||||
| to_encode = leading_whitespace + to_encode | ||||||||||||||
robsdedude marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||
|
|
||||||||||||||
| trailing_wsp = '' | ||||||||||||||
| if to_encode[-1] in WSP: | ||||||||||||||
|
|
@@ -3040,20 +3056,13 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, | |||||||||||||
|
|
||||||||||||||
| while to_encode: | ||||||||||||||
| remaining_space = maxlen - len(lines[-1]) | ||||||||||||||
| text_space = remaining_space - chrome_len - len(leading_whitespace) | ||||||||||||||
| text_space = remaining_space - chrome_len | ||||||||||||||
| if text_space <= 0: | ||||||||||||||
| lines.append(' ') | ||||||||||||||
| newline = _steal_trailing_WSP_if_exists(lines) | ||||||||||||||
| lines.append(newline or ' ') | ||||||||||||||
| new_last_ew = len(lines[-1]) | ||||||||||||||
| continue | ||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One of my new tests revealed a pre-existing problem here:
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This suggestion does not seem to fix the test cases
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm. Maybe there's a bug in your steal all whitespace function? Using my suggestion the tests pass for me, but I see you didn't make that change, at least in what I can see. On the other hand, github also doesn't show the fix applied, so I'm not sure where you are at, exactly. However, I also see that I failed to run the full test suite before posting the review, and there's a whitespace change in three existing tests. I think it's a bugfix, but I'll have to look closer when I get a chance later today.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I only had the change applied locally. I think I found the issue. The suggestion here looks like the With the suggestion applied correctly to my local working tree I now see the same: your new tests pass, 3 old tests started failing. I went ahead and adjusted the test expectations. Let me know what you think once you've had a chance to have a closer look.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah that suggestion looks borked. I don't see any way to dismiss it, unfortunately. The test expectation changes are correct, that extra space after Subject was a bug. I may even have opened an issue about it, but if so I can't find it. Probably just made a "note to self" that I've since lost. |
||||||||||||||
|
|
||||||||||||||
| # If we are at the start of a continuation line, prepend whitespace | ||||||||||||||
| # (we only want to do this when the line starts with an encoded word | ||||||||||||||
| # but if we're folding in this helper function, then we know that we | ||||||||||||||
| # are going to be writing out an encoded word.) | ||||||||||||||
| if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace: | ||||||||||||||
| encoded_word = _ew.encode(leading_whitespace, charset=encode_as) | ||||||||||||||
| lines[-1] += encoded_word | ||||||||||||||
| leading_whitespace = '' | ||||||||||||||
|
|
||||||||||||||
| to_encode_word = to_encode[:text_space] | ||||||||||||||
| encoded_word = _ew.encode(to_encode_word, charset=encode_as) | ||||||||||||||
| excess = len(encoded_word) - remaining_space | ||||||||||||||
|
|
@@ -3065,7 +3074,6 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, | |||||||||||||
| excess = len(encoded_word) - remaining_space | ||||||||||||||
| lines[-1] += encoded_word | ||||||||||||||
| to_encode = to_encode[len(to_encode_word):] | ||||||||||||||
| leading_whitespace = '' | ||||||||||||||
|
|
||||||||||||||
| if to_encode: | ||||||||||||||
| lines.append(' ') | ||||||||||||||
|
|
||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Fix the folding of headers by the :mod:`email` library when :rfc:`2047` encoded words are used. Now whitespace is correctly preserved and also correctly added between adjacent encoded words. The latter property was broken by the fix for gh-92081, which mostly fixed previous failures to preserve whitespace. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I find this name a bit confusing. What it actually does is "return true if last_word_is_ew is true and added_str consists of whitespace only". I can't think of a name that is shorter and as descriptive as the actual code. So I'm not in favor of this function.