Fix email header wrapping omitting white space

robsdedude · robsdedude · commit b5925e056e5e · 2026-02-10T22:15:06.000+01:00
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
@@ -2835,6 +2835,30 @@ def _steal_trailing_WSP_if_exists(lines):
             lines.pop()
     return wsp
 
+def _steal_all_trailing_WSP_if_exists(lines):
+    """Remove the trailing WSP run from *lines* (spanning lines); return it.
+
+    A ' ' line replaces any all-WSP lines that had to be dropped.
+    """
+    stolen = []  # Collected back to front; reversed before joining.
+    dropped_line = False
+    while lines and lines[-1]:
+        line = lines[-1]
+        cut = len(line)
+        while cut and line[cut - 1] in WSP:
+            cut -= 1
+        if cut == len(line):
+            break  # No trailing WSP left to take.
+        stolen.append(line[cut:])
+        if cut:
+            lines[-1] = line[:cut]
+            break  # Hit non-WSP text, so the run ends here.
+        lines.pop()
+        dropped_line = True
+    if dropped_line:
+        lines.append(' ')
+    return ''.join(reversed(stolen))
+
 def _refold_parse_tree(parse_tree, *, policy):
     """Return string of contents of parse_tree folded according to RFC rules.
 
@@ -2843,9 +2867,7 @@ def _refold_parse_tree(parse_tree, *, policy):
     maxlen = policy.max_line_length or sys.maxsize
     encoding = 'utf-8' if policy.utf8 else 'us-ascii'
     lines = ['']  # Folded lines to be output
-    leading_whitespace = ''  # When we have whitespace between two encoded
-                             # words, we may need to encode the whitespace
-                             # at the beginning of the second word.
+    last_word_is_ew = False
     last_ew = None  # Points to the last encoded character if there's an ew on
                     # the line
     last_charset = None
@@ -2882,6 +2904,7 @@ def _refold_parse_tree(parse_tree, *, policy):
         if part.token_type == 'mime-parameters':
             # Mime parameter folding (using RFC2231) is extra special.
             _fold_mime_parameters(part, lines, maxlen, encoding)
+            last_word_is_ew = False
             continue
 
         if want_encoding and not wrap_as_ew_blocked:
@@ -2898,6 +2921,7 @@ def _refold_parse_tree(parse_tree, *, policy):
                             # XXX what if encoded_part has no leading FWS?
                             lines.append(newline)
                         lines[-1] += encoded_part
+                        last_word_is_ew = False
                         continue
                 # Either this is not a major syntactic break, so we don't
                 # want it on a line by itself even if it fits, or it
@@ -2917,10 +2941,8 @@ def _refold_parse_tree(parse_tree, *, policy):
                      last_charset == 'utf-8' and charset != 'us-ascii')):
                     last_ew = None
                 last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
-                                      part.ew_combine_allowed, charset, leading_whitespace)
-                # This whitespace has been added to the lines in _fold_as_ew()
-                # so clear it now.
-                leading_whitespace = ''
+                                      part.ew_combine_allowed, charset, last_word_is_ew)
+                last_word_is_ew = True
                 last_charset = charset
                 want_encoding = False
                 continue
@@ -2933,28 +2955,20 @@ def _refold_parse_tree(parse_tree, *, policy):
 
         if len(tstr) <= maxlen - len(lines[-1]):
             lines[-1] += tstr
+            if any(char not in WSP for char in tstr):
+                last_word_is_ew = False
             continue
 
         # This part is too long to fit.  The RFC wants us to break at
         # "major syntactic breaks", so unless we don't consider this
         # to be one, check if it will fit on the next line by itself.
-        leading_whitespace = ''
         if (part.syntactic_break and
                 len(tstr) + 1 <= maxlen):
             newline = _steal_trailing_WSP_if_exists(lines)
             if newline or part.startswith_fws():
-                # We're going to fold the data onto a new line here.  Due to
-                # the way encoded strings handle continuation lines, we need to
-                # be prepared to encode any whitespace if the next line turns
-                # out to start with an encoded word.
                 lines.append(newline + tstr)
-
-                whitespace_accumulator = []
-                for char in lines[-1]:
-                    if char not in WSP:
-                        break
-                    whitespace_accumulator.append(char)
-                leading_whitespace = ''.join(whitespace_accumulator)
+                if not all(char in WSP for char in lines[-1]):
+                    last_word_is_ew = False
                 last_ew = None
                 continue
         if not hasattr(part, 'encode'):
@@ -2994,10 +3008,12 @@ def _refold_parse_tree(parse_tree, *, policy):
         else:
             # We can't fold it onto the next line either...
             lines[-1] += tstr
+        if any(char not in WSP for char in tstr):
+            last_word_is_ew = False
 
     return policy.linesep.join(lines) + policy.linesep
 
-def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace):
+def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew):
     """Fold string to_encode into lines as encoded word, combining if allowed.
     Return the new value for last_ew, or None if ew_combine_allowed is False.
 
@@ -3012,14 +3028,22 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
         to_encode = str(
             get_unstructured(lines[-1][last_ew:] + to_encode))
         lines[-1] = lines[-1][:last_ew]
-    elif to_encode[0] in WSP:
+    elif to_encode[0] in WSP and not last_word_is_ew:
         # We're joining this to non-encoded text, so don't encode
         # the leading blank.
         leading_wsp = to_encode[0]
         to_encode = to_encode[1:]
         if (len(lines[-1]) == maxlen):
             lines.append(_steal_trailing_WSP_if_exists(lines))
         lines[-1] += leading_wsp
+    elif last_word_is_ew:
+        # If we are following up an encoded word with another encoded word,
+        # any white space between the two will be ignored when decoded.
+        # Therefore, we encode all to-be-displayed whitespace in the second
+        # encoded word.
+        leading_whitespace = _steal_all_trailing_WSP_if_exists(lines)
+        to_encode = leading_whitespace + to_encode
+        lines[-1] = ' '
 
     trailing_wsp = ''
     if to_encode[-1] in WSP:
@@ -3040,20 +3064,11 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
 
     while to_encode:
         remaining_space = maxlen - len(lines[-1])
-        text_space = remaining_space - chrome_len - len(leading_whitespace)
+        text_space = remaining_space - chrome_len
         if text_space <= 0:
             lines.append(' ')
             continue
 
-        # If we are at the start of a continuation line, prepend whitespace
-        # (we only want to do this when the line starts with an encoded word
-        # but if we're folding in this helper function, then we know that we
-        # are going to be writing out an encoded word.)
-        if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace:
-            encoded_word = _ew.encode(leading_whitespace, charset=encode_as)
-            lines[-1] += encoded_word
-            leading_whitespace = ''
-
         to_encode_word = to_encode[:text_space]
         encoded_word = _ew.encode(to_encode_word, charset=encode_as)
         excess = len(encoded_word) - remaining_space
@@ -3065,7 +3080,6 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
             excess = len(encoded_word) - remaining_space
         lines[-1] += encoded_word
         to_encode = to_encode[len(to_encode_word):]
-        leading_whitespace = ''
 
         if to_encode:
             lines.append(' ')
diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py
@@ -393,6 +393,24 @@ def test_defaults_handle_spaces_at_start_of_continuation_line(self):
         g.flatten(msg)
         self.assertEqual(s.getvalue(), expected)
 
+    # Regression test for gh-144156:
+    # https://github.com/python/cpython/issues/144156
+    def test_defaults_handle_spaces_at_start_of_continuation_line_2(self):
+        subject = ("Re: [SOS-1495488] Commande et livraison - Demande de retour - "
+                   "bibijolie - 251210-AABBCC - Abo actualités digitales 20 semaines "
+                   "d’abonnement à 24 heures, Bilan, Tribune de Genève et tous les titres Tamedia")
+        folded = (b"Subject: "
+                  b"Re: [SOS-1495488] Commande et livraison - Demande de retour -\n"
+                  b" bibijolie - 251210-AABBCC - Abo =?utf-8?q?actualit=C3=A9s?= digitales 20\n"
+                  b" semaines =?utf-8?q?d=E2=80=99abonnement_=C3=A0?= 24 heures, Bilan, Tribune de\n"
+                  b" =?utf-8?q?Gen=C3=A8ve?= et tous les titres Tamedia\n\n")
+        message = EmailMessage()
+        message['Subject'] = subject
+        # Serialize with no generator options overridden; compare raw bytes.
+        buffer = io.BytesIO()
+        BytesGenerator(buffer).flatten(message)
+        self.assertEqual(buffer.getvalue(), folded)
+
     def test_cte_type_7bit_handles_unknown_8bit(self):
         source = ("Subject: Maintenant je vous présente mon "
                  "collègue\n\n").encode('utf-8')