On a similar line I actually wrote a little Python script to forward the old emails from the group using an IMAP/SMTP combo. In theory it was nice:
- I could detect the intended category of quite a lot of the messages based on their content and direct them to the appropriate discourse mailbox
- it doesn’t need access to the docker container, or to change anything directly on the server
- the date of the post is used correctly (i.e. we can post in the past!) unlike the googlegroup scraper(?)
Unfortunately I ran into a few problems with email rejections and bounces. I guess it looked too much like the forum was being spammed by new users, and I haven’t had time (yet) to go through the settings to work out how to make Discourse temporarily relaxed enough to let in all the posts from new staged users.
In case anyone wants to play with it:
from __future__ import unicode_literals
from datetime import date
import imaplib
import smtplib
import email
import getpass
import time
# Script configuration and connection setup. Everything below runs at start-up:
# it prompts for a password, then opens the SMTP and IMAP connections used by
# the forwarding loop further down.

# entries recorded for messages already forwarded; the main loop skips a
# message it finds here
sent = []
# dry-run switch: when True the main loop prints its decisions but hands
# nothing to the SMTP server
testing = False
# --- source account (IMAP) ---
HOST = 'imap.googlemail.com'
USERNAME = 'user.name'
PASSWORD = getpass.getpass()  # prompted interactively so it is never stored in the file
ssl = True  # NOTE(review): not read anywhere in the visible code - confirm it is still needed
# date window for the IMAP search (used as SINCE fromDate / BEFORE toDate)
# NOTE(review): RFC 3501 dates use three-letter month names (e.g. "01-Jul-2016");
# the full month names here presumably rely on server leniency - confirm
fromDate = "01-July-2016"
toDate = "10-July-2016"
toSearch = [ # search these mailboxes with associated terms
{'mbox':'read_mail', 'term':'TO "psychopy-users@googlegroups.com"'},
{'mbox':'psychopy-dev', 'term':'TO "psychopy-dev@googlegroups.com"'},
]
# smtp server
# DON'T USE smtp.gmail as it converts the From: field to be that account
# NOTE(review): the address below looks like a placeholder - replace with a real host:port
outbox = smtplib.SMTP('smtp.server.address:port')
# imap server
client = imaplib.IMAP4_SSL(HOST)
client.login(USERNAME, PASSWORD)
# print client.list() # show valid mailboxes
# some helper functions to search for terms
def is_(msg, terms, notTerms):
    """Guess whether ``msg`` belongs to a category by keyword matching.

    Args:
        msg: the message text to inspect. Bytes are accepted and decoded
            leniently as UTF-8 before matching.
        terms: substrings that suggest the category; one hit is enough.
        notTerms: substrings that make the text ambiguous; any hit vetoes
            the guess regardless of ``terms``.

    Returns:
        True if at least one of ``terms`` occurs in ``msg`` and none of
        ``notTerms`` does; False otherwise, including when ``msg`` cannot
        be searched at all (e.g. it is None).
    """
    # Raw IMAP payloads are byte strings; decode them with replacement so a
    # stray non-ASCII byte cannot abort the comparison against text terms.
    if isinstance(msg, bytes):
        msg = msg.decode('utf-8', 'replace')
    try:
        # veto first: an ambiguous message is never claimed by this category
        if any(thisTerm in msg for thisTerm in notTerms):
            return False
        # otherwise a single hopeful term is enough
        return any(thisTerm in msg for thisTerm in terms)
    except (TypeError, UnicodeError):
        # unsearchable input: stay best-effort and simply decline to guess
        return False
def isBuilder(msg):
    """Report whether ``is_`` accepts ``msg`` for the builder category.

    Passes the builder keyword list together with 'coder' as the
    disqualifying word, and hands back whatever ``is_`` returns.
    """
    builderWords = ['routine', 'flow', 'builder', 'graphical']
    ambiguousWords = ['coder']  # a mention of coder makes the guess unreliable
    return is_(msg, builderWords, ambiguousWords)
def isCoder(msg):
    """Report whether ``is_`` accepts ``msg`` for the coder category.

    Passes the coder keyword list together with 'builder' as the
    disqualifying word, and hands back whatever ``is_`` returns.
    """
    coderWords = ['import', 'iohub', 'script']
    ambiguousWords = ['builder']  # a mention of builder makes the guess unreliable
    return is_(msg, coderWords, ambiguousWords)
# Remember the discourse address chosen for each topic, keyed by the tail of
# its subject line, so that one topic need not end up in several categories.
knownTopics = dict()
# running tallies of where messages were routed (for the summary print only)
nBuilder = nCoder = nUnknown = nDev = 0
# do the actual work: search each configured mailbox, pick a discourse
# category address for every hit, and re-send the message to that address
DEV_TARGET = "psychopy+dev@discoursemail.com"
BUILDER_TARGET = "psychopy+builder@discoursemail.com"
CODER_TARGET = "psychopy+coder@discoursemail.com"
OTHER_TARGET = "psychopy+other@discoursemail.com"

for search in toSearch:
    # set the location and term for this search
    client.select(search['mbox'])  # select that mailbox/folder
    searchPhrase = '(%s SINCE "%s" BEFORE "%s")' % (search['term'], fromDate, toDate)
    status, response = client.search(None, searchPhrase)
    # mail from the dev list is routed by mailbox, not by its content
    isDevSearch = 'dev' in search['term']
    # then loop over results
    for msgID in response[0].split():
        # IMAP message numbers restart in every mailbox, so the mailbox name
        # must be part of the key; otherwise a later mailbox would skip mail
        # whose number merely coincides with one already handled
        sentKey = (search['mbox'], msgID)
        if sentKey in sent:
            continue
        status, email_data = client.fetch(msgID, "(RFC822)")
        if status != 'OK' or not isinstance(email_data[0], tuple):
            # nothing usable came back for this ID; report it and move on
            print("%s: could not be fetched (%s) - skipped" % (msgID, status))
            continue
        env, msg = email_data[0]
        message = email.message_from_string(msg)
        subj = message['Subject'] or ''  # the header may be missing altogether
        # the tail of the subject is unaffected by prefixes added to replies
        topicKey = subj[-15:]
        # try to determine target
        if isDevSearch:
            target = DEV_TARGET
        elif topicKey and topicKey in knownTopics:
            # reuse the earlier decision so a single topic stays in one category
            target = knownTopics[topicKey]
        elif isBuilder(msg):
            target = BUILDER_TARGET
        elif isCoder(msg):
            target = CODER_TARGET
        else:
            target = OTHER_TARGET
        # dev topics are decided by mailbox, so only content-based guesses
        # are remembered for later messages
        if topicKey and not isDevSearch:
            knownTopics[topicKey] = target
        # just for info to keep track
        if target == DEV_TARGET:
            nDev += 1
        elif target == BUILDER_TARGET:
            nBuilder += 1
        elif target == CODER_TARGET:
            nCoder += 1
        else:
            nUnknown += 1
        # make sure the "To:" field matches the target address
        if 'To' in message:
            message.replace_header("To", target)
        else:
            message['To'] = target
        print("%s: %s - %s" % (msgID, message['From'], message['Subject']))
        print(" -> %s" % (target))
        if not testing:
            outbox.sendmail(message['From'], target, message.as_string())
            time.sleep(2.0)  # could adjust to slow this down and look more "human"?
            # recorded only after a real send, so a dry run never marks mail as handled
            sent.append(sentKey)
print("Builder=%i, Coder=%i, Dev=%i, Unknown=%i"
      % (nBuilder, nCoder, nDev, nUnknown))
print(sent)  # could use this to store handled messages for next run