Beautiful Soup 4
Parsing a Page
import bs4 as bs
import urllib.request

# Fetch the page. The context manager closes the HTTP response as soon as the
# body has been read, even if read() raises.
with urllib.request.urlopen('https://lin-chen-va.github.io/') as source:  # request webpage
    content = source.read()  # read response body

soup = bs.BeautifulSoup(content, 'html.parser')  # parse page

print(soup.prettify())  # print the parsed document, re-indented
		
Searching Children
import bs4 as bs
import urllib.request

# Fetch the page. The context manager closes the HTTP response as soon as the
# body has been read, even if read() raises.
with urllib.request.urlopen('https://lin-chen-va.github.io/') as source:
    content = source.read()

soup = bs.BeautifulSoup(content, 'html.parser')

html = soup.find_all('html')  # find the <html> tag(s); returns a list
html_children = list(html[0].children)  # direct children of the first <html> tag

# NOTE: .children yields text nodes (e.g. whitespace between tags) as well as
# tags, so indices 1 and 3 assume this page has a text node before <head> and
# another between <head> and <body> -- verify against the actual markup.
print(html_children[1])  # <head> element
print(html_children[3])  # <body> element
		
Finding all instances of a tag
import bs4 as bs
import urllib.request

# Fetch the page. The context manager closes the HTTP response as soon as the
# body has been read, even if read() raises.
with urllib.request.urlopen('https://lin-chen-va.github.io/') as source:
    content = source.read()

soup = bs.BeautifulSoup(content, 'html.parser')

sections = soup.find_all('section')  # find every <section> tag in the document

print(sections[0])  # first <section>; raises IndexError if the page has none
		
Searching for tags by class
import bs4 as bs
import urllib.request

# Fetch the page. The context manager closes the HTTP response as soon as the
# body has been read, even if read() raises.
with urllib.request.urlopen('https://lin-chen-va.github.io/') as source:
    content = source.read()

soup = bs.BeautifulSoup(content, 'html.parser')

# `class` is a reserved word in Python, so Beautiful Soup takes `class_` instead.
majors = soup.find_all(class_='major')  # find tags by CSS class name

for major in majors:
    print(major)
		
Searching for tag by id
import bs4 as bs
import urllib.request

# Fetch the page. The context manager closes the HTTP response as soon as the
# body has been read, even if read() raises.
with urllib.request.urlopen('https://lin-chen-va.github.io/') as source:
    content = source.read()

soup = bs.BeautifulSoup(content, 'html.parser')

teaching = soup.find_all(id='teaching')  # find tags by their id attribute

print(teaching[0])  # first match; raises IndexError if no tag has this id
		
Searching for tag by attributes
# Find the tags under `item` whose "type" attribute is "application/ld+json".
# NOTE(review): `item` is not defined in this snippet -- presumably a Tag or
# BeautifulSoup object obtained earlier; confirm before running.
scripts = item.find_all(attrs={"type" : "application/ld+json"})
		
Searching Using CSS Selectors
import bs4 as bs
import urllib.request

# Fetch the page. The context manager closes the HTTP response as soon as the
# body has been read, even if read() raises.
with urllib.request.urlopen('https://lin-chen-va.github.io/') as source:
    content = source.read()

soup = bs.BeautifulSoup(content, 'html.parser')

# CSS descendant selector: every <p> nested anywhere inside a <section>.
ps = soup.select('section p')

for p in ps:
    print(p)
		
Reference
  • Tutorial