4.2.1.1.1. 爬取 Elasticsearch 英文文档内容

直接上代码

# -*- coding: utf-8 -*-

from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Crawl the Elasticsearch reference guide: read the table-of-contents page,
# then save the extracted text of every linked guide page into a local file
# named after that page.
tocUrl = "https://www.elastic.co/guide/en/elasticsearch/reference/current/toc.html"
basePageUrl = "https://www.elastic.co/guide/en/elasticsearch/reference/current/"

# Seconds to wait on a single HTTP request, so one stalled connection cannot
# hang the whole crawl.
requestTimeout = 30

# Selects the text nodes to extract from each page (below the "part",
# "section", "chapter" and "xpack section" divs). Built once, outside the loop.
contentXPath = (
    '//div[@class="part"]/div/*/text()'
    ' | //div[@class="section"]/*/text()'
    ' | //div[@class="chapter"]/*/text()'
    ' | //div[@class="xpack section"]/*/text() '
)

r = requests.get(tocUrl, timeout=requestTimeout)
# Fail loudly rather than crawling the links of an HTTP error page.
r.raise_for_status()

soup = BeautifulSoup(r.content, 'html.parser')

# Page names already saved; several anchors can point into the same page
# (for example with different "#fragment" parts).
seenPages = set()

# Visit every linked page. Anchors without an href have nothing to fetch and
# would raise KeyError on k['href'], so they are filtered out up front.
for k in soup.find_all('a', href=True):
    # Resolve the link against the guide root and drop any "#fragment", so
    # relative, absolute and in-page links all become one canonical page URL.
    fullUrl = urldefrag(urljoin(basePageUrl, k['href'])).url

    # Only pages that live directly inside the guide directory are crawled:
    # anything else is not part of this guide and cannot be used as a plain
    # file name in the current directory.
    if not fullUrl.startswith(basePageUrl):
        continue
    pageName = fullUrl[len(basePageUrl):]
    if not pageName or '/' in pageName or pageName in seenPages:
        continue
    seenPages.add(pageName)

    print(fullUrl)
    resp = requests.get(fullUrl, timeout=requestTimeout)
    if not resp.ok:
        # Do not store the body of an error page as if it were documentation.
        print("skipped, HTTP status", resp.status_code)
        continue

    con = etree.HTML(resp.content)
    if con is None:
        # NOTE(review): the HTML parser appears to return None for an empty
        # document; skip instead of failing on con.xpath.
        continue
    part = con.xpath(contentXPath)

    # The context manager closes the file even if a write fails.
    with open(pageName, 'w', encoding="utf-8") as f:
        f.writelines(t.replace("\r\n", " ") for t in part)
Copyright © www.ainiok.com 2018 All rights reserved, powered by GitBook. 修订时间: 2019-12-05 17:44:13

results matching ""

    No results matching ""