#!/usr/bin/python

# Automatically download "Il Fatto Quotidiano", authenticating to the PDF archive
#
# Copyright: (C) 2009-2010 Stefano Zacchiroli
# License: GNU General Public License 3 or above
#
# Last-Modified: Tue, 27 Jul 2010 08:36:00 +0200

import datetime
import os
import re
import sys

import mechanize

# Credentials submitted to the login form; the values below look like
# placeholders and are meant to be replaced with real ones before use.
USERNAME = "quilatua@email"
PASSWORD = "quilatuapassword"
LOGIN_URL = 'http://www.ilfattoquotidiano.it/login/?redirect_to=http://www.ilfattoquotidiano.it/abbonati/'


def _filename_from_content_disposition(cdisp):
    """Extract a safe local file name from a Content-Disposition header.

    Sample header: ``attachment; filename=ilfatto20091008.pdf``

    Returns the bare file name (directory components stripped, since the
    value is supplied by the server), or None when the header is missing,
    is not an attachment, or carries no usable ``filename`` parameter.
    """
    if not cdisp or not cdisp.strip().lower().startswith('attachment'):
        return None
    for param in cdisp.split(';')[1:]:
        key, sep, value = param.partition('=')
        if not sep or key.strip().lower() != 'filename':
            continue
        value = value.strip('" \t\r\n')  # remove surrounding garbage
        # Never let the remote side choose a directory to write into.
        name = os.path.basename(value.replace('\\', '/'))
        return name or None
    return None


def main():
    """Log in, fetch today's issue as PDF and save it in the current directory.

    The name of the saved file is printed on standard output. If the server
    does not suggest a file name, a warning is written to standard error and
    the PDF is saved as ``ilfatto.pdf``.
    """
    br = mechanize.Browser()
    br.open(LOGIN_URL)

    # stage 1: login into PDF archive
    br.select_form('loginform')
    br['log'] = USERNAME
    br['pwd'] = PASSWORD
    br.submit()

    # stage 2: set (today's) date
    br.select_form(nr=1)
    br.submit()

    # stage 3: retrieve PDF
    # The download URL is built directly from today's date rather than by
    # following the "openpdf" link found on the page.
    now = datetime.datetime.now()
    pdf_res = br.open('http://www.ilfattoquotidiano.it/openpdf/?n=%s'
                      % now.strftime('%Y%m%d'))

    # stage 4: save PDF
    # Use the public header accessor; .get() yields None when the header is
    # absent so that the fallback name below is used instead of crashing.
    cdisp = pdf_res.info().get('content-disposition')
    fname = _filename_from_content_disposition(cdisp)
    if fname is None:
        sys.stderr.write("W: can't guess filename, saving to ilfatto.pdf\n")
        fname = 'ilfatto.pdf'

    # PDF is binary data: write it in binary mode, and let the context
    # manager close the file even if the write fails.
    with open(fname, 'wb') as pdf_file:
        pdf_file.write(pdf_res.get_data())
    print(fname)


if __name__ == '__main__':
    main()