Using pyppeteer to scrape dynamic JavaScript web content


pyppeteer

# python3 -c 'import pyppeteer; pyppeteer.chromium_downloader.download_chromium()'

# [W:pyppeteer.chromium_downloader] chromium extracted to: /home/ubuntu/.local/share/pyppeteer/local-chromium/575458

demo 1

import requests
from pyquery import PyQuery as pq

# Fetch the page with a plain HTTP request (no browser involved) and
# count the elements matching the ``.quote`` selector in the returned HTML.
url = 'http://quotes.toscrape.com/js/'
html = requests.get(url).text
doc = pq(html)
quote_nodes = doc('.quote')
print('Quotes:', quote_nodes.length)

0

demo 2

import asyncio
from pyppeteer import launch
from pyquery import PyQuery as pq

async def main():
    """Load the quotes page in a pyppeteer-launched browser and count quotes.

    Opens a new page, navigates to the JavaScript version of the quotes
    site, parses the page content with PyQuery and prints the number of
    elements matching ``.quote``.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto('http://quotes.toscrape.com/js/')
        doc = pq(await page.content())
        print('Quotes:', doc('.quote').length)
    finally:
        # Close the browser even when navigation or parsing raises, so the
        # launched browser is not left running after a failure.
        await browser.close()

asyncio.get_event_loop().run_until_complete(main())

10

demo 3

import asyncio
from pyppeteer import launch

async def main():
    """Capture the quotes page as PNG and PDF and print viewport dimensions.

    Navigates to the JavaScript version of the quotes site, writes
    ``example.png`` and ``example.pdf`` to the current directory, then
    evaluates a small JavaScript function in the page to read the client
    width, client height and device pixel ratio, and prints the result.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto('http://quotes.toscrape.com/js/')
        await page.screenshot(path='example.png')
        await page.pdf(path='example.pdf')
        dimensions = await page.evaluate('''() => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }''')

        print(dimensions)
        # >>> {'width': 800, 'height': 600, 'deviceScaleFactor': 1}
    finally:
        # Close the browser even when a step above raises, so the launched
        # browser is not left running after a failure.
        await browser.close()

asyncio.get_event_loop().run_until_complete(main())

demo 4

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import asyncio
from pyppeteer import launch

async def main():
    """Screenshot a locally served chart page once it signals it is ready.

    Launches the browser with ``--no-sandbox``, opens the tool page on
    127.0.0.1:5000, waits until the ``#echartReadyDiv`` element exists,
    saves ``example.png`` and prints the viewport dimensions read from the
    page via JavaScript.
    """
    # --no-sandbox: see https://github.com/miyakogi/pyppeteer/issues/171
    browser = await launch({
        'args': ['--no-sandbox'],
    })
    try:
        page = await browser.newPage()
        url = "http://127.0.0.1:5000/tool?1784076_1_1_1_1_1"
        await page.goto(url)
        # Alternative: a fixed delay instead of waiting for a selector.
        # await page.waitFor(3000)  # wait for 3 seconds

        # ===================================================
        # Page-side JS logic: once the ECharts chart has finished
        # rendering, the page dynamically adds #echartReadyDiv to the HTML
        # to signal that the image is ready to download / send to server.
        await page.waitForSelector('#echartReadyDiv')
        # ===================================================

        # Take the screenshot only after the page has finished rendering;
        # an image produced before loading completes has incomplete content.
        await page.screenshot(path='example.png')
        # await page.pdf(path='example.pdf')
        dimensions = await page.evaluate('''() => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }''')

        print(dimensions)
        # >>> {'width': 800, 'height': 600, 'deviceScaleFactor': 1}
        await page.close()
    finally:
        # Close the browser even when a step above raises (for example a
        # waitForSelector timeout), so it is not left running.
        await browser.close()

asyncio.get_event_loop().run_until_complete(main())

Page APIs:

  • page.waitFor(3000) # wait for 3 seconds
  • page.waitForSelector('#echartReadyDiv') # wait for selector
  • page.waitForXPath('//*[@id="echartReadyDiv"]') # wait for xpath

Reference

History

  • 2020/1/17: created.

Author: kezunlin
Reprint policy: Unless otherwise stated, all articles in this blog are licensed under the CC BY 4.0 reprint policy. If reproduced, please indicate the source: kezunlin!
评论
  TOC