Crawling the Elasticsearch English documentation

Straight to the code:

# -*- coding: utf-8 -*-

import requests
from lxml import etree
from bs4 import BeautifulSoup

# Table of contents of the Elasticsearch reference guide
tocUrl = "https://www.elastic.co/guide/en/elasticsearch/reference/current/toc.html"

r = requests.get(tocUrl)
soup = BeautifulSoup(r.content, 'html.parser')

basePageUrl = "https://www.elastic.co/guide/en/elasticsearch/reference/current/"

# Follow every link in the TOC and save the text of each page
for k in soup.find_all('a'):
    href = k.get('href')
    if not href:
        # skip anchors that have no href attribute
        continue
    fullUrl = basePageUrl + href
    print(fullUrl)
    resp = requests.get(fullUrl)
    con = etree.HTML(resp.content)
    # Collect the visible text from the part/chapter/section containers
    part = con.xpath('//div[@class="part"]/div/*/text() | //div[@class="section"]/*/text() | //div[@class="chapter"]/*/text() | //div[@class="xpack section"]/*/text()')
    # Write each page into its own file, named after the href
    with open(href, 'w', encoding="utf-8") as f:
        for t in part:
            f.write(t.replace("\r\n", " "))
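The script saves each page as a separate text file named after its href. If the TOC happens to list the same page more than once, or links to it with a #fragment (I have not verified this for the current guide, so treat it as an assumption), a small deduplication pass over the links before fetching avoids downloading the same page twice. A minimal sketch, reusing the soup object from the script above:

# A sketch, not part of the original script: collect unique page names
# from the TOC before fetching, dropping any #fragment from the href.
seen = set()
hrefs = []
for k in soup.find_all('a'):
    href = k.get('href')
    if not href:
        continue
    page = href.split('#')[0]   # keep only the page name
    if page and page not in seen:
        seen.add(page)
        hrefs.append(page)
# hrefs can now drive the download loop above instead of soup.find_all('a').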
