网站首页  汉语字词  英语词汇  考试资料  写作素材  旧版资料

请输入您要查询的考试资料:

 

标题 python实现的一个火车票转让信息采集器
内容
    这篇文章主要介绍了Python实现的一个火车票转让信息采集器,采集信息来源是58同城或者赶集网,需要的朋友可以参考下。
    #coding: utf-8
    '''
    春运查询火车票转让信息
    author: piglei2007@gmail.com
    date: 2011.01.25
    '''
    import re
    import os
    import time
    import urlparse
    import datetime
    import traceback
    import urllib2
    import socket
    socket.setdefaulttimeout(20)
    blank_re = re.compile(r\s+)
    opener = urllib2.build_opener(urllib2.httpcookieprocessor())
    opener.addheaders = [
      (user-agent, mozilla/5.0 (x11; u; freebsd i386; en-us; rv:1.9.1) gecko/20090704 firefox/3.5),
      (accept, */*),
    ]
    urllib2.install_opener(opener)
    from beautifulsoup import beautifulsoup
    source = {
      58: http://bj.58.com/huochepiao/?num=%(train)s&starttime=%(date)s00,
      ganji: http://bj.ganji.com/piao/cc_%(train)s/%(date)s/,
    }
    record_file = /tmp/ticket_records.txt
    def parse_record():
      try:
        return set([x.strip() for x in open(record_file, r).readlines()])
      except ioerror:
        open(record_file, w)
        return set()
    def flush_record(records):
      open(record_file, w).write(\n.join(records))
    def main(config):
      开始抓取
      existed = parse_record()
      to_email = []
      for train in config[trains]:
        for date in config[dates]:
          for type, _url in source.items():
            url = _url % dict(train=train, date=date)
            content = urllib2.urlopen(url).read()
            soup = beautifulsoup(content)
            result = parse_content(type, soup, train)
            for url, text in result:
              url = urlparse.urljoin(_url, url)
              # 只要卧铺!
              if url not in existed and u卧 in text:
                to_email.append([text, url])
              existed.add(url)
      if to_email:
        content = .join(
          [x for x in [ | .join(y) for y in to_email]]
        ).encode(utf-8)
        simple_mail(config[people], content)
      flush_record(existed)
    def parse_content(type, soup, train):
      获得车次信息
      result = []
      if type == 58:
        info_table = soup.find(table, id=infolist)
        if info_table:
          for x in info_table.findall(tr, text=re.compile(ur%s(?!时刻表) % train, re.i)):
            a = x.parent
            _text = blank_re.sub(, a.text)
            result.append([a[href], _text])
      if type == ganji:
        for x in soup.findall(dl, {class: list_piao}):
          a = x.dt.a
          result.append([a[href], a.text])
      return result
    # SMTP account settings read by simple_mail() when sending notifications.
    # The user and password below are placeholders; replace them before running.
    email_host = 'smtp.sohu.com'
    email_host_user = 'yourname@sohu.com'
    email_host_password = 'yourpassword'
    email_port = 25
    def simple_mail(to, content):
      发送邮件
      import smtplib
      from email.mime.text import mimetext
      msgroot = mimetext(content, 'html', 'utf-8')
      msgroot['subject'] = [%s]有票来啦!!!! % datetime.datetime.today().isoformat( )
      msgroot['from'] = email_host_user
      msgroot['to'] = , .join(to)
      s = smtplib.smtp(email_host, email_port)
      s.login(email_host_user, email_host_password)
      s.sendmail(email_host_user, to, msgroot.as_string())
      s.close()
    def switch_time_zone():
      切换时区
      os.environ[tz] = asia/shanghai
      time.tzset()
    switch_time_zone()
    if __name__ == '__main__':
      config = {
        trains: (k471,),
        dates: (20110129,),
        people: (
          youremail@sohu.com,
        )
      }
      try:
        main(config)
        print %s: ok % datetime.datetime.today()
      except exception, e:
        print traceback.format_exc()然后放入cron,你懂的。
随便看

 

在线学习网考试资料包含高考、自考、专升本考试、人事考试、公务员考试、大学生村官考试、特岗教师招聘考试、事业单位招聘考试、企业人才招聘、银行招聘、教师招聘、农村信用社招聘、各类资格证书考试等各类考试资料。

 

Copyright © 2002-2024 cuapp.net All Rights Reserved
更新时间:2025/5/21 2:52:43