diff --git a/rag/app/email.py b/rag/app/email.py index 1affe4f25a4..2e95eceecbc 100644 --- a/rag/app/email.py +++ b/rag/app/email.py @@ -62,14 +62,27 @@ def chunk( # get the email main info def _add_content(msg, content_type): + def _decode_payload(payload, charset, target_list): + try: + target_list.append(payload.decode(charset)) + except (UnicodeDecodeError, LookupError): + for enc in ["utf-8", "gb2312", "gbk", "gb18030", "latin1"]: + try: + target_list.append(payload.decode(enc)) + break + except UnicodeDecodeError: + continue + else: + target_list.append(payload.decode("utf-8", errors="ignore")) + if content_type == "text/plain": - text_txt.append( - msg.get_payload(decode=True).decode(msg.get_content_charset()) - ) + payload = msg.get_payload(decode=True) + charset = msg.get_content_charset() or "utf-8" + _decode_payload(payload, charset, text_txt) elif content_type == "text/html": - html_txt.append( - msg.get_payload(decode=True).decode(msg.get_content_charset()) - ) + payload = msg.get_payload(decode=True) + charset = msg.get_content_charset() or "utf-8" + _decode_payload(payload, charset, html_txt) elif "multipart" in content_type: if msg.is_multipart(): for part in msg.iter_parts():