Skip to content

Commit b5925e0

Browse files
committed
Fix email header wrapping omitting white space
1 parent cfeede8 commit b5925e0

File tree

2 files changed

+64
-32
lines changed

2 files changed

+64
-32
lines changed

Lib/email/_header_value_parser.py

Lines changed: 46 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -2835,6 +2835,30 @@ def _steal_trailing_WSP_if_exists(lines):
28352835
lines.pop()
28362836
return wsp
28372837

2838+
def _steal_all_trailing_WSP_if_exists(lines):
    """Strip the whitespace that ends the folded output and return it.

    Working backwards from the last entry of *lines*, remove that entry's
    run of trailing WSP characters.  An entry made up of nothing but
    whitespace is dropped entirely and the scan carries on with the entry
    before it; the scan stops at the first entry that keeps at least one
    non-WSP character, or that has no trailing whitespace at all.

    *lines* is modified in place.  If any entry was dropped, a fresh ' '
    continuation line is appended afterwards.  The return value is all of
    the removed whitespace concatenated in its original order ('' when
    nothing was removed).
    """
    dropped_a_line = False
    stolen = []  # Collected last-line-first; reversed when joined.
    while lines and lines[-1]:
        current = lines[-1]
        # Locate the start of the trailing whitespace run.
        cut = len(current)
        while cut > 0 and current[cut - 1] in WSP:
            cut -= 1
        if cut == len(current):
            # This line does not end in whitespace: nothing more to take.
            break
        stolen.append(current[cut:])
        if cut:
            # Some real content remains on this line, so stop here.
            lines[-1] = current[:cut]
            break
        # The whole line was whitespace; remove it and look further back.
        lines.pop()
        dropped_a_line = True

    if dropped_a_line:
        lines.append(' ')
    return ''.join(reversed(stolen))
2861+
28382862
def _refold_parse_tree(parse_tree, *, policy):
28392863
"""Return string of contents of parse_tree folded according to RFC rules.
28402864
@@ -2843,9 +2867,7 @@ def _refold_parse_tree(parse_tree, *, policy):
28432867
maxlen = policy.max_line_length or sys.maxsize
28442868
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
28452869
lines = [''] # Folded lines to be output
2846-
leading_whitespace = '' # When we have whitespace between two encoded
2847-
# words, we may need to encode the whitespace
2848-
# at the beginning of the second word.
2870+
last_word_is_ew = False
28492871
last_ew = None # Points to the last encoded character if there's an ew on
28502872
# the line
28512873
last_charset = None
@@ -2882,6 +2904,7 @@ def _refold_parse_tree(parse_tree, *, policy):
28822904
if part.token_type == 'mime-parameters':
28832905
# Mime parameter folding (using RFC2231) is extra special.
28842906
_fold_mime_parameters(part, lines, maxlen, encoding)
2907+
last_word_is_ew = False
28852908
continue
28862909

28872910
if want_encoding and not wrap_as_ew_blocked:
@@ -2898,6 +2921,7 @@ def _refold_parse_tree(parse_tree, *, policy):
28982921
# XXX what if encoded_part has no leading FWS?
28992922
lines.append(newline)
29002923
lines[-1] += encoded_part
2924+
last_word_is_ew = False
29012925
continue
29022926
# Either this is not a major syntactic break, so we don't
29032927
# want it on a line by itself even if it fits, or it
@@ -2917,10 +2941,8 @@ def _refold_parse_tree(parse_tree, *, policy):
29172941
last_charset == 'utf-8' and charset != 'us-ascii')):
29182942
last_ew = None
29192943
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
2920-
part.ew_combine_allowed, charset, leading_whitespace)
2921-
# This whitespace has been added to the lines in _fold_as_ew()
2922-
# so clear it now.
2923-
leading_whitespace = ''
2944+
part.ew_combine_allowed, charset, last_word_is_ew)
2945+
last_word_is_ew = True
29242946
last_charset = charset
29252947
want_encoding = False
29262948
continue
@@ -2933,28 +2955,20 @@ def _refold_parse_tree(parse_tree, *, policy):
29332955

29342956
if len(tstr) <= maxlen - len(lines[-1]):
29352957
lines[-1] += tstr
2958+
if any(char not in WSP for char in tstr):
2959+
last_word_is_ew = False
29362960
continue
29372961

29382962
# This part is too long to fit. The RFC wants us to break at
29392963
# "major syntactic breaks", so unless we don't consider this
29402964
# to be one, check if it will fit on the next line by itself.
2941-
leading_whitespace = ''
29422965
if (part.syntactic_break and
29432966
len(tstr) + 1 <= maxlen):
29442967
newline = _steal_trailing_WSP_if_exists(lines)
29452968
if newline or part.startswith_fws():
2946-
# We're going to fold the data onto a new line here. Due to
2947-
# the way encoded strings handle continuation lines, we need to
2948-
# be prepared to encode any whitespace if the next line turns
2949-
# out to start with an encoded word.
29502969
lines.append(newline + tstr)
2951-
2952-
whitespace_accumulator = []
2953-
for char in lines[-1]:
2954-
if char not in WSP:
2955-
break
2956-
whitespace_accumulator.append(char)
2957-
leading_whitespace = ''.join(whitespace_accumulator)
2970+
if not all(char in WSP for char in lines[-1]):
2971+
last_word_is_ew = False
29582972
last_ew = None
29592973
continue
29602974
if not hasattr(part, 'encode'):
@@ -2994,10 +3008,12 @@ def _refold_parse_tree(parse_tree, *, policy):
29943008
else:
29953009
# We can't fold it onto the next line either...
29963010
lines[-1] += tstr
3011+
if any(char not in WSP for char in tstr):
3012+
last_word_is_ew = False
29973013

29983014
return policy.linesep.join(lines) + policy.linesep
29993015

3000-
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace):
3016+
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew):
30013017
"""Fold string to_encode into lines as encoded word, combining if allowed.
30023018
Return the new value for last_ew, or None if ew_combine_allowed is False.
30033019
@@ -3012,14 +3028,22 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
30123028
to_encode = str(
30133029
get_unstructured(lines[-1][last_ew:] + to_encode))
30143030
lines[-1] = lines[-1][:last_ew]
3015-
elif to_encode[0] in WSP:
3031+
elif to_encode[0] in WSP and not last_word_is_ew:
30163032
# We're joining this to non-encoded text, so don't encode
30173033
# the leading blank.
30183034
leading_wsp = to_encode[0]
30193035
to_encode = to_encode[1:]
30203036
if (len(lines[-1]) == maxlen):
30213037
lines.append(_steal_trailing_WSP_if_exists(lines))
30223038
lines[-1] += leading_wsp
3039+
elif last_word_is_ew:
3040+
# If we are following up an encoded word with another encoded word,
3041+
# any white space between the two will be ignored when decoded.
3042+
# Therefore, we encode all to-be-displayed whitespace in the second
3043+
# encoded word.
3044+
leading_whitespace = _steal_all_trailing_WSP_if_exists(lines)
3045+
to_encode = leading_whitespace + to_encode
3046+
lines[-1] = ' '
30233047

30243048
trailing_wsp = ''
30253049
if to_encode[-1] in WSP:
@@ -3040,20 +3064,11 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
30403064

30413065
while to_encode:
30423066
remaining_space = maxlen - len(lines[-1])
3043-
text_space = remaining_space - chrome_len - len(leading_whitespace)
3067+
text_space = remaining_space - chrome_len
30443068
if text_space <= 0:
30453069
lines.append(' ')
30463070
continue
30473071

3048-
# If we are at the start of a continuation line, prepend whitespace
3049-
# (we only want to do this when the line starts with an encoded word
3050-
# but if we're folding in this helper function, then we know that we
3051-
# are going to be writing out an encoded word.)
3052-
if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace:
3053-
encoded_word = _ew.encode(leading_whitespace, charset=encode_as)
3054-
lines[-1] += encoded_word
3055-
leading_whitespace = ''
3056-
30573072
to_encode_word = to_encode[:text_space]
30583073
encoded_word = _ew.encode(to_encode_word, charset=encode_as)
30593074
excess = len(encoded_word) - remaining_space
@@ -3065,7 +3080,6 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
30653080
excess = len(encoded_word) - remaining_space
30663081
lines[-1] += encoded_word
30673082
to_encode = to_encode[len(to_encode_word):]
3068-
leading_whitespace = ''
30693083

30703084
if to_encode:
30713085
lines.append(' ')

Lib/test/test_email/test_generator.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -393,6 +393,24 @@ def test_defaults_handle_spaces_at_start_of_continuation_line(self):
393393
g.flatten(msg)
394394
self.assertEqual(s.getvalue(), expected)
395395

396+
# gh-144156
397+
# https://github.com/python/cpython/issues/144156
398+
def test_defaults_handle_spaces_at_start_of_continuation_line_2(self):
    # A long subject mixing plain words with words that need encoding:
    # flattening must fold it into the exact bytes below, keeping the
    # spaces that separate the encoded words from their neighbours.
    subject = ("Re: [SOS-1495488] Commande et livraison - Demande de retour - "
               "bibijolie - 251210-AABBCC - Abo actualités digitales 20 semaines "
               "d’abonnement à 24 heures, Bilan, Tribune de Genève et tous les titres Tamedia")
    folded = (b"Subject: "
              b"Re: [SOS-1495488] Commande et livraison - Demande de retour -\n"
              b" bibijolie - 251210-AABBCC - Abo =?utf-8?q?actualit=C3=A9s?= digitales 20\n"
              b" semaines =?utf-8?q?d=E2=80=99abonnement_=C3=A0?= 24 heures, Bilan, Tribune de\n"
              b" =?utf-8?q?Gen=C3=A8ve?= et tous les titres Tamedia\n\n")
    message = EmailMessage()
    message['Subject'] = subject
    buffer = io.BytesIO()
    BytesGenerator(buffer).flatten(message)
    self.assertEqual(buffer.getvalue(), folded)
413+
396414
def test_cte_type_7bit_handles_unknown_8bit(self):
397415
source = ("Subject: Maintenant je vous présente mon "
398416
"collègue\n\n").encode('utf-8')

0 commit comments

Comments
 (0)