使用 Scrapy 爬取新闻

Item

class AifranItem(scrapy.Item):
    """Container for a single scraped news entry.

    Every attribute is a plain ``scrapy.Field()`` with no metadata, so the
    item performs no validation or conversion of the values stored in it.
    """

    # Image reference for the entry.
    img = scrapy.Field()

    # Headline text.
    title = scrapy.Field()

    # Link to the entry.
    url = scrapy.Field()

    # Date associated with the entry (presumably "YYYY-MM-DD"; confirm
    # against the spiders that populate it).
    date = scrapy.Field()

    # Creation timestamp of the record (presumably "YYYY-MM-DD HH:MM:SS";
    # confirm against the spiders that populate it).
    create_at = scrapy.Field()

    # Identifier of the site the entry came from.
    source = scrapy.Field()

spider

import json
from asyncio import sleep
import time
from telnetlib import EC

import scrapy
from scrapy import Selector, Request
from selenium import webdriver

from ..items import AifranItem, Ygmoive_info_Item, YgmoiveItem, Ygmoive_pages_Item

class IfranSpider(scrapy.Spider):
    """Spider that converts the Readhub topic API feed into ``AifranItem`` objects.

    The single start URL returns a JSON document; each entry under its
    ``data`` key becomes one item whose title is taken from the first element
    of the entry's ``newsArray`` and whose URL points at the topic page on
    readhub.cn.
    """

    name = 'readhub_spider'
    start_urls = [
        'https://api.readhub.cn/topic?lastCursor=&pageSize=20'
    ]

    def __init__(self, *args, **kwargs):
        """Initialise the spider, forwarding every argument to ``scrapy.Spider``.

        Accepting ``*args``/``**kwargs`` keeps ``scrapy crawl ... -a key=value``
        working, and calling the base initialiser lets Scrapy perform its own
        setup instead of silently skipping it.
        """
        super().__init__(*args, **kwargs)

    def closed(self, spider):
        """Print a notice once the crawl has finished.

        Args:
            spider: Despite its name, Scrapy passes the close *reason* string
                here (e.g. ``'finished'``). The parameter name is kept
                unchanged for backward compatibility; the value is unused.
        """
        print('spider closed')

    def parse(self, response):
        """Yield one ``AifranItem`` per topic found in the JSON response.

        Args:
            response: Scrapy response whose ``text`` is a JSON object with a
                ``data`` list of topics.

        Yields:
            AifranItem: populated with an empty ``img``, the first news title,
            the topic URL, the crawl date/time and the fixed source
            ``'readhub'``.

        Raises:
            ValueError: if the response body is not valid JSON
                (``json.JSONDecodeError`` is a subclass of ``ValueError``).
        """
        payload = json.loads(response.text)
        print('===========')

        # Read the clock once per response so that ``date`` and ``create_at``
        # can never disagree (e.g. when a crawl straddles midnight).
        now = time.localtime()
        crawl_date = time.strftime("%Y-%m-%d", now)
        crawled_at = time.strftime("%Y-%m-%d %H:%M:%S", now)

        # A missing or null ``data`` key is treated as an empty page.
        for topic in payload.get('data') or []:
            news = topic.get('newsArray') or []
            if not news:
                # Without a news entry there is no title to store; skip this
                # topic rather than abort the rest of the page.
                self.logger.warning(
                    'Skipping topic %r: empty newsArray', topic.get('id')
                )
                continue

            item = AifranItem()
            item['img'] = ''
            item['title'] = news[0]['title']
            # ``str()`` keeps the concatenation valid if the API returns a
            # numeric id instead of a string.
            item['url'] = 'https://readhub.cn/topic/' + str(topic['id'])
            item['date'] = crawl_date
            item['create_at'] = crawled_at
            item['source'] = 'readhub'
            yield item
        print('===========')

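爬虫写好后，在 shell 中通过命令 `scrapy crawl readhub_spider` 运行，并设置定时任务，每隔三十分钟爬取一次（例如 crontab：`*/30 * * * * cd /path/to/project && scrapy crawl readhub_spider`）。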

Scrapy 爬取新闻

Item

spider

个人博客

By cc

Related Post

发表回复取消回复

更多文章

阿里的 binlog 的增量订阅和消费组件

mysql存储引擎InnoDB和MyISAM

数据结构的理解

git的不常用方法

Scrapy 爬取新闻

Item

spider

个人博客

By cc

Related Post

发表回复 取消回复

更多文章

阿里的 binlog 的增量订阅和消费组件

mysql存储引擎InnoDB和MyISAM

数据结构的理解

git的不常用方法

发表回复取消回复