标题 | python实现的一个火车票转让信息采集器 |
内容 | 这篇文章主要介绍了python实现的一个火车票转让信息采集器,采集信息来源是58同程或者赶集网,需要的朋友可以参考下。 #coding: utf-8 ''' 春运查询火车票转让信息 author: piglei2007@gmail.com date: 2011.01.25 ''' import re import os import time import urlparse import datetime import traceback import urllib2 import socket socket.setdefaulttimeout(20) blank_re = re.compile(r\s+) opener = urllib2.build_opener(urllib2.httpcookieprocessor()) opener.addheaders = [ (user-agent, mozilla/5.0 (x11; u; freebsd i386; en-us; rv:1.9.1) gecko/20090704 firefox/3.5), (accept, */*), ] urllib2.install_opener(opener) from beautifulsoup import beautifulsoup source = { 58: http://bj.58.com/huochepiao/?num=%(train)s&starttime=%(date)s00, ganji: http://bj.ganji.com/piao/cc_%(train)s/%(date)s/, } record_file = /tmp/ticket_records.txt def parse_record(): try: return set([x.strip() for x in open(record_file, r).readlines()]) except ioerror: open(record_file, w) return set() def flush_record(records): open(record_file, w).write(\n.join(records)) def main(config): 开始抓取 existed = parse_record() to_email = [] for train in config[trains]: for date in config[dates]: for type, _url in source.items(): url = _url % dict(train=train, date=date) content = urllib2.urlopen(url).read() soup = beautifulsoup(content) result = parse_content(type, soup, train) for url, text in result: url = urlparse.urljoin(_url, url) # 只要卧铺! if url not in existed and u卧 in text: to_email.append([text, url]) existed.add(url) if to_email: content = .join( [x for x in [ | .join(y) for y in to_email]] ).encode(utf-8) simple_mail(config[people], content) flush_record(existed) def parse_content(type, soup, train): 获得车次信息 result = [] if type == 58: info_table = soup.find(table, id=infolist) if info_table: for x in info_table.findall(tr, text=re.compile(ur%s(?!时刻表) % train, re.i)): a = x.parent _text = blank_re.sub(, a.text) result.append([a[href], _text]) if type == ganji: for x in soup.findall(dl, {class: list_piao}): a = x.dt.a result.append([a[href], a.text]) return result email_host = 'smtp.sohu.com' email_host_user = 'yourname@sohu.com' email_host_password = 'yourpassword' email_port = 25 def simple_mail(to, content): 发送邮件 import smtplib from email.mime.text import mimetext msgroot = mimetext(content, 'html', 'utf-8') msgroot['subject'] = [%s]有票来啦!!!! % datetime.datetime.today().isoformat( ) msgroot['from'] = email_host_user msgroot['to'] = , .join(to) s = smtplib.smtp(email_host, email_port) s.login(email_host_user, email_host_password) s.sendmail(email_host_user, to, msgroot.as_string()) s.close() def switch_time_zone(): 切换时区 os.environ[tz] = asia/shanghai time.tzset() switch_time_zone() if __name__ == '__main__': config = { trains: (k471,), dates: (20110129,), people: ( youremail@sohu.com, ) } try: main(config) print %s: ok % datetime.datetime.today() except exception, e: print traceback.format_exc()然后放入cron,你懂的。 |
随便看 |
|
在线学习网考试资料包含高考、自考、专升本考试、人事考试、公务员考试、大学生村官考试、特岗教师招聘考试、事业单位招聘考试、企业人才招聘、银行招聘、教师招聘、农村信用社招聘、各类资格证书考试等各类考试资料。