from os import path
## URLs of the two SpamAssassin public-corpus archives used here (one spam set, one ham set).
spam_url = "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"
ham_url = "http://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2"
## Local archive names: the last path component of each URL, which is the file
## name the existence checks and the tar listing below refer to.
spam_archive = path.basename(spam_url)
ham_archive = path.basename(ham_url)
## Uncomment the next line to delete the local copies and force a fresh download.
#!rm $ham_archive $spam_archive
### Download ham and spam archives if we don't have them yet
## Lines starting with "!" are IPython shell escapes; $name is replaced by the
## value of the Python variable of that name before the command runs.
![ ! -f $spam_archive ] && wget $spam_url
![ ! -f $ham_archive ] && wget $ham_url
## List the .bz2 files now present in the working directory.
!ls *.bz2
print(ham_archive, spam_archive)
## Show the first ten entries of the spam archive's table of contents.
!tar -tvf $spam_archive | head -10
Each email is stored in a separate file inside the archive (with a cryptic name ;-).
## .tar(.bz2, etc.) support in Python standard library:
import tarfile
## email parsing, also Python standard library:
import email
def iterate_emails(tar_path):
    """Lazily yield every email stored in a tar archive.

    Each regular file in the archive is parsed into an
    ``email.message.Message``; other members (directories, links, ...)
    are skipped.

    Parameters:
        tar_path: path of the (optionally compressed) tar archive.

    Yields:
        One parsed message per regular file, in archive order.

    The archive and each member file are managed by ``with`` blocks, so they
    are closed even when the caller stops iterating early (for example after
    a single ``next()`` call) and the generator is closed or garbage-collected.
    """
    with tarfile.open(tar_path) as tar:
        for info in tar:
            if not info.isfile():
                continue
            ## parse contents of compressed file into an Email-object;
            ## the parser reads the whole member, so the file handle can be
            ## released before the message is handed to the caller
            with tar.extractfile(info) as f:
                msg = email.message_from_binary_file(f)
            yield msg
def generator():
    """Toy generator used to demonstrate lazy evaluation: yields 0, 1, 2, 3."""
    yield from range(4)
## Calling a generator function only creates a generator object; none of the
## function body runs until a value is requested.
generator()
## next() runs the body up to the first yield and returns that value.
next(generator())
## list() drains a fresh generator and collects every yielded value.
list(generator())
## Same pattern with the archive reader: this only creates the generator object ...
iterate_emails(spam_archive)
## ... while this parses and returns just the first email.
next(iterate_emails(spam_archive))
## Materialise every email of the archive in a list, then keep the one at index 32.
msg = list(iterate_emails(spam_archive))[32]
## Print the first 1000 characters of its serialised form.
print(msg.as_string()[:1000])
def print_highlight(s):
    """Print *s* with Quoted-Printable soft line breaks and ``=3D`` highlighted.

    Every "=" directly followed by a newline and every "=3D" sequence is
    wrapped in an ANSI background-colour escape followed by a reset, and the
    resulting text is written to standard output.
    """
    highlight_on = "\x1b[" + "48;2;252;227;40m"
    highlight_off = "\x1b[" + "0m"
    ## Same replacement order as the escapes are listed: soft breaks, then =3D.
    for escape in ("=\n", "=3D"):
        s = s.replace(escape, highlight_on + escape + highlight_off)
    print(s)
## Show characters 900-1299 of the serialised message with the escapes highlighted.
print_highlight(msg.as_string()[900:1300])
The emails are in RFC2045 "Quoted-Printable" encoding -- among other things, lines are wrapped at 76 characters, and equals signs are escaped. Python's `email` library can un-wrap text lines and do other MIME decoding.
## decode=True undoes the Content-Transfer-Encoding and returns bytes, which are
## then decoded as UTF-8; only the first 500 characters are printed.
## NOTE(review): assumes this message is single-part and its body is valid UTF-8 -- confirm.
print(msg.get_payload(decode=True).decode('utf-8')[:500])
There are several inconsistencies and edge cases in the email data. In the following, we provide a parsing function that can handle those:
def _decode_text_part(part):
    """Return the body of a single text/* message part as a ``str``."""
    try:
        ## decode MIME encoding (quoted-printable, base64, ...) into raw bytes
        payload = part.get_payload(decode=True)
    except Exception:
        ## best effort: a malformed part must not abort processing of the corpus
        payload = None
    if isinstance(payload, bytes):
        charset = part.get_content_charset()
        if charset:
            try:
                return payload.decode(charset)
            except (LookupError, ValueError):
                ## unknown charset name, or bytes that are invalid in that charset
                pass
        ## if charset from header is missing or doesn't work, force UTF-8
        return payload.decode('utf-8', 'replace')
    ## nothing decodable: fall back to the raw payload if it is plain text
    raw = part.get_payload()
    return raw if isinstance(raw, str) else ""


def mail_text(msg):
    """Flatten an email into one string of header and body text.

    Parameters:
        msg: an ``email.message.Message`` (possibly multipart).

    Returns:
        A single string made of every header name, every header value that
        is a ``str``, and the decoded body of every ``text/*`` part, joined
        by single spaces. Header values that are not strings are left out;
        only their names are kept.

    Body bytes are decoded with the charset declared in the part's headers;
    if that charset is missing, unknown, or does not fit the data, UTF-8 with
    replacement characters is used instead.
    """
    headers = []
    for k, v in msg.items():
        headers.append(k)
        if isinstance(v, str):
            headers.append(v)
    contents = [
        _decode_text_part(part)
        for part in msg.walk()
        if part.get_content_type().startswith('text')
    ]
    return " ".join(headers + contents)
## Try the parser on the first email of the spam archive and show the start of the result.
msg = next(iterate_emails(spam_archive))
print(mail_text(msg)[:1500])
## Convert every email of both archives into one text string each.
spam = [mail_text(msg) for msg in iterate_emails(spam_archive)]
ham = [mail_text(msg) for msg in iterate_emails(ham_archive)]
## Number of spam and ham emails that were read.
print(len(spam), len(ham))
import re  ## Python standard regex library
from collections import Counter

## Sample text for trying out the tokenizer.
test_string = """Enjoy this special offer!
Buy the super-viagra today for only $99!!
The offer is only available today!
"""

## A token is a run of word characters, apostrophes, dollar signs or hyphens.
token = re.compile(r"[\w'$-]+")
tokens = [match.group() for match in token.finditer(test_string.lower())]
print(tokens)

## Tally how often each token occurs.
print(tokens)
c = Counter(tokens)
print(c)