【Python 3.7 + BeautifulSoup 4: Douban Top 250 Crawler】

  • Output: a .csv file:

    import requests
    import csv
    import random
    import time
    import socket
    import http.client
    from bs4 import BeautifulSoup
    
    def get_html(url, data=None):
        # Download a page, retrying with a pause after common network errors.
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',  # skip 'br' unless the brotli package is installed
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        timeout = random.choice(range(80, 100))
        while True:
            try:
                response = requests.get(url, headers=header, timeout=timeout)
                response.encoding = 'utf-8'
                break
            except socket.timeout as e:
                print(e)
                time.sleep(random.choice(range(20, 60)))
            except socket.error as e:
                print(e)
                time.sleep(random.choice(range(0, 60)))
            except http.client.BadStatusLine as e:
                print(e)
                time.sleep(random.choice(range(30, 60)))
            except http.client.IncompleteRead as e:
                print(e)
                time.sleep(random.choice(range(20, 60)))
        return response.text
        
    def get_data(html_text, rank):
        # Parse one listing page and return rows of
        # [rank, title, other title, rating, summary, link].
        result = []
        bs = BeautifulSoup(html_text, "html.parser")
        content = bs.find_all('div', {'class': 'info'})
        for movie in content:
            temp = []
            temp.append(rank)
            rank += 1
            titles = movie.find_all('span', 'title')
            for span in titles:
                temp.append(span.text.replace('/', ''))
            if len(titles) == 1:
                # Only a Chinese title; fall back to the alias in the "other" span.
                temp.append(movie.find('span', 'other').text.replace('/', ''))
            for span in movie.find_all('span', 'rating_num'):
                temp.append(span.text)
            temp.append(movie.find('p').text)  # director / cast / year blurb
            a_tag = movie.find('a')
            temp.append(a_tag['href'])  # link to the movie's detail page
            result.append(temp)
        return result
        
    def data_output(data, filename):
        with open(filename, 'a', encoding='utf-8', errors='ignore', newline='') as f:
            f_csv = csv.writer(f)
            f_csv.writerows(data)
            
    if __name__ == '__main__':
        for i in range(0, 250, 25):  # 10 pages, 25 movies per page
            url = 'https://movie.douban.com/top250?start=' + str(i)
            html = get_html(url)
            result = get_data(html, i+1)
            data_output(result, 'top250.csv')
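
  • The script above only appends data rows, so top250.csv has no header line. A minimal sketch for writing one (the column labels are my own names for the order get_data builds each row in, not part of the original script); calling it once at the top of the __main__ block keeps the appended rows lined up with these labels:

    import csv
    import os
    
    def write_csv_header(filename):
        # Write the header only once, when the file does not exist yet.
        if not os.path.exists(filename):
            with open(filename, 'w', encoding='utf-8', newline='') as f:
                csv.writer(f).writerow(['rank', 'title', 'other_title', 'rating', 'info', 'link'])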
  • 【Python 3.7 + BeautifulSoup 4 + Scrapy: Simple Crawler Examples】

  • Bored over the holiday, I wrote a quick crawler to scrape this blog's article list page (https://qxx.hk/blog).

    1. Python 3.7 + BeautifulSoup 4

  • Output: a .csv file:

  • Code:
    import requests
    import csv
    import random
    import time
    import socket
    import http.client
    from bs4 import BeautifulSoup
    
    def get_html(url, data=None):
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        timeout = random.choice(range(80, 100))
        while True:
            try:
                response = requests.get(url, headers=header, timeout=timeout)
                response.encoding = 'utf-8'
                break
            except socket.timeout as e:
                print(e)
                time.sleep(random.choice(range(20, 60)))
            except socket.error as e:
                print(e)
                time.sleep(random.choice(range(0, 60)))
            except http.client.BadStatusLine as e:
                print(e)
                time.sleep(random.choice(range(30, 60)))
            except http.client.IncompleteRead as e:
                print(e)
                time.sleep(random.choice(range(20, 60)))
        return response.text
        
    def get_data(html_text):
        # Parse the article list page and return rows of [url, title].
        result = []
        bs = BeautifulSoup(html_text, "html.parser")
        content = bs.find_all('div', {'class': 'brick'})
        for blog in content:
            temp = []
            a_tag = blog.find('a')
            a_tag.span.extract()  # remove the nested <span> so a_tag.text is just the title
            temp.append(a_tag['href'])
            temp.append(a_tag.text)
            result.append(temp)
        
        return result
        
    def data_output(data, filename):
        with open(filename, 'a', encoding='utf-8', errors='ignore', newline='') as f:
            f_csv = csv.writer(f)
            f_csv.writerows(data)
            
    if __name__ == '__main__':
        url = 'https://qxx.hk/blog'
        html = get_html(url)
        result = get_data(html)
        data_output(result, 'blog.csv')
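
  • get_data above assumes each entry on the list page is a div with class "brick" wrapping a link whose nested <span> holds something other than the title text. The markup in the snippet below is only a guess for illustration, not the blog's actual HTML, but it gives a quick way to sanity-check the parser without hitting the site:

    sample = ('<div class="brick"><a href="/blog/hello-world">'
              '<span>2019-02-01</span>Hello World</a></div>')
    print(get_data(sample))  # prints [['/blog/hello-world', 'Hello World']]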

    2. Python 3.7 + Scrapy

  • Output: a .json file:

    #myblog_spider.py
    import scrapy
    from myblog_spider.items import MyblogSpiderItem
    
    class myBlogSpider(scrapy.Spider):
    	name = 'myBlogSpider'
    	allowed_domains = ['qxx.hk']
    	start_urls = ["https://qxx.hk/blog"]
    
    	def parse(self, response):
    		for blog in response.xpath('//div[@class="brick"]/a'):
    			item = MyblogSpiderItem()
    			item['url'] = blog.xpath('@href').extract_first()
    			item['title'] = blog.xpath('text()').extract_first()
    			yield item
    #items.py
    import scrapy
    
    class MyblogSpiderItem(scrapy.Item):
    	# define the fields for your item here like:
    	# name = scrapy.Field()
    	title = scrapy.Field()
    	url = scrapy.Field()
    	pass
    #pipelines.py
    import codecs
    import json
    
    class MyblogSpiderPipeline(object):
    	def __init__(self):        
    		self.file = codecs.open('logs.json', 'w', encoding='utf-8')
    	def process_item(self, item, spider):
    		line = json.dumps(dict(item), ensure_ascii=False) + "\n"
    		self.file.write(line)
    		return item
    	def close_spider(self, spider):  # Scrapy calls close_spider on pipelines when the spider finishes
    		self.file.close()
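
  • Scrapy only runs this pipeline if it is enabled in the project's settings.py; the module path below assumes the project is named myblog_spider, matching the import in myblog_spider.py. With that in place, running scrapy crawl myBlogSpider from the project root writes logs.json. (For a quick test without a custom pipeline, Scrapy's built-in feed export, scrapy crawl myBlogSpider -o blog.json, also works.)

    #settings.py (excerpt)
    ITEM_PIPELINES = {
        'myblog_spider.pipelines.MyblogSpiderPipeline': 300,
    }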