How to fetch information from a website in Python

python
webscraping
selenium
playwright
Is Playwright any better than Selenium?
Author

Jakob Johannesson

Published

5/6/2022


Page is under construction

Simple example

Open a browser, create a new page, go to a URL, click the "About" link, and parse the resulting HTML with BeautifulSoup.

from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup

# Activate the virtual environment before running this script:
#   source env/bin/activate


# Open the site in a visible WebKit window, follow the "About" link and print
# the text of the first <h2 class="anchored"> heading found in the content.
with sync_playwright() as p:
    # headless=False shows the browser window; slow_mo=50 slows every
    # Playwright operation by 50 ms so the steps are easy to follow.
    browser = p.webkit.launch(headless=False, slow_mo=50)
    try:
        page = browser.new_page()
        page.goto("https://superkaka.se")
        page.locator('a:has-text("About")').click()

        # Hand the markup inside the ".content" element to BeautifulSoup.
        html = page.inner_html(".content")
        soup = BeautifulSoup(html, "html.parser")

        # Tip: soup.find_all("p") would return every paragraph instead.
        # find() returns None when nothing matches, so check before
        # reading .text to avoid an AttributeError.
        heading = soup.find("h2", {"class": "anchored"})
        if heading is None:
            print("No <h2 class='anchored'> heading found in .content")
        else:
            print(f'Here is what is under  {heading.text}')
    finally:
        # Close the browser even if navigation or parsing fails.
        browser.close()

Super simple :)

Example with a login

Go to a specific page, log in, wait for the table to load, then scrape the table.

from io import StringIO

import pandas as pd
from playwright.sync_api import sync_playwright

# Log in to the Shiny app, open the data tab, wait for the table to render
# and parse every HTML table inside ".wrapper" into pandas DataFrames.
with sync_playwright() as p:
    # headless=False shows the browser window; slow_mo=50 slows every
    # Playwright operation by 50 ms so the steps are easy to follow.
    browser = p.webkit.launch(headless=False, slow_mo=50)
    try:
        page = browser.new_page()
        page.goto("https://appforiarteam.shinyapps.io/Shiny_Plotly/")

        # Fill in the login form and submit it.
        page.fill("input#userName", "test")
        page.fill("input#passwd", "test2")
        page.click("button[id=Login]")

        # Switch to the tab linked as "#shiny-tab-data_vis".
        page.click("a[href='#shiny-tab-data_vis']")

        # Wait explicitly until a table row is present instead of clicking
        # one: a click also waits, but it may trigger the page's own click
        # handlers on the row.
        page.wait_for_selector("tr[role='row']")

        html = page.inner_html(".wrapper")

        # Passing a literal HTML string to read_html is deprecated in recent
        # pandas versions; wrap it in StringIO. The result is a list with
        # one DataFrame per <table> found.
        tables = pd.read_html(StringIO(html))
        print(tables)
    finally:
        # Close the browser even if login or scraping fails.
        browser.close()