0%

Using pyppeteer to scrape dynamic JavaScript web content

pyppeteer

1
2
3
4
# python3 -c 'import pyppeteer; pyppeteer.chromium_downloader.download_chromium()'

# [W:pyppeteer.chromium_downloader] chromium extracted to: /home/ubuntu/.local/share/pyppeteer/local-chromium/575458

demo 1

1
2
3
4
5
6
7
import requests
from pyquery import PyQuery as pq

# Fetch the page with a plain HTTP GET. No browser is involved, so any
# content the page builds with JavaScript is absent from the HTML we parse.
url = 'http://quotes.toscrape.com/js/'
doc = pq(requests.get(url).text)

# Count the elements carrying the "quote" CSS class in the static HTML.
quote_count = doc('.quote').length
print('Quotes:', quote_count)

Output: 0 (requests does not execute JavaScript, so the quotes are never rendered into the HTML)

demo 2

1
2
3
4
5
6
7
8
9
10
11
12
13
import asyncio
from pyppeteer import launch
from pyquery import PyQuery as pq

async def main() -> None:
    """Count the quotes on a JavaScript-rendered page using headless Chromium.

    Launches a browser through pyppeteer, navigates to the demo page, parses
    the browser's HTML with pyquery and prints how many ``.quote`` elements
    it contains.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto('http://quotes.toscrape.com/js/')
        # Parse the HTML as the browser currently holds it, not the raw
        # HTTP response body.
        doc = pq(await page.content())
        print('Quotes:', doc('.quote').length)
    finally:
        # Always shut Chromium down, even when navigation or parsing fails,
        # so no browser process is left running.
        await browser.close()


# asyncio.run() creates a fresh event loop, runs main() to completion and
# closes the loop; it replaces the deprecated
# asyncio.get_event_loop().run_until_complete(...) pattern.
asyncio.run(main())

Output: 10 (pyppeteer loads the page in Chromium, so the JavaScript-generated quotes are present)

demo 3

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import asyncio
from pyppeteer import launch

async def main() -> None:
    """Capture a screenshot and a PDF of a page, then print its viewport size.

    Writes ``example.png`` and ``example.pdf`` to the current working
    directory and prints a dict with the page's client width, client height
    and device pixel ratio as reported by the browser.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto('http://quotes.toscrape.com/js/')
        await page.screenshot(path='example.png')
        await page.pdf(path='example.pdf')

        # The arrow function runs inside the page; its return value is
        # handed back to Python.
        dimensions = await page.evaluate('''() => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }''')

        print(dimensions)
        # >>> {'width': 800, 'height': 600, 'deviceScaleFactor': 1}
    finally:
        # Always shut Chromium down, even if a step above raised, so no
        # browser process is left running.
        await browser.close()


# asyncio.run() creates a fresh event loop, runs main() to completion and
# closes the loop; it replaces the deprecated
# asyncio.get_event_loop().run_until_complete(...) pattern.
asyncio.run(main())

demo 4

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import asyncio
from pyppeteer import launch

async def main() -> None:
    """Screenshot a locally served chart page once the chart has rendered.

    Opens the page in headless Chromium, waits until the page adds the
    ``#echartReadyDiv`` marker element, saves ``example.png`` to the current
    working directory and prints the viewport dimensions.
    """
    # Chromium is started with --no-sandbox; for background see
    # https://github.com/miyakogi/pyppeteer/issues/171
    browser = await launch({
        'args': ['--no-sandbox'],
    })
    try:
        page = await browser.newPage()
        url = "http://127.0.0.1:5000/tool?1784076_1_1_1_1_1"
        await page.goto(url)

        # ===================================================
        # The page's JavaScript dynamically adds an element with id
        # "echartReadyDiv" once the ECharts image has finished rendering,
        # signalling that the image is ready to be downloaded or sent to
        # the server. Waiting for that marker is used here instead of a
        # fixed delay such as `await page.waitFor(3000)` (3 seconds).
        await page.waitForSelector('#echartReadyDiv')
        # ===================================================

        # Take the screenshot only after rendering has completed; an image
        # produced before the page finishes loading would be incomplete.
        # (`await page.pdf(path='example.pdf')` could be added here to save
        # a PDF as well.)
        await page.screenshot(path='example.png')

        # The arrow function runs inside the page; its return value is
        # handed back to Python.
        dimensions = await page.evaluate('''() => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }''')

        print(dimensions)
        # >>> {'width': 800, 'height': 600, 'deviceScaleFactor': 1}
        await page.close()
    finally:
        # Always shut Chromium down, even if a step above raised (for
        # example a waitForSelector timeout), so no browser process is left
        # running.
        await browser.close()


# asyncio.run() creates a fresh event loop, runs main() to completion and
# closes the loop; it replaces the deprecated
# asyncio.get_event_loop().run_until_complete(...) pattern.
asyncio.run(main())

Page APIs:

  • page.waitFor(3000) # wait for 3 seconds
  • page.waitForSelector('#echartReadyDiv') # wait for selector
  • page.waitForXPath('//*[@id="echartReadyDiv"]') # wait for xpath

Reference

History

  • 2020/1/17: created.