Beautiful Soup 4
Parsing a Page
# Download a page, parse it with Beautiful Soup, and pretty-print the tree.
import bs4 as bs
import urllib.request

# The context manager closes the HTTP response as soon as the body is read,
# instead of leaving the connection open until garbage collection.
with urllib.request.urlopen('https://lin-chen-va.github.io/') as source:  # request webpage
    content = source.read()  # read response
soup = bs.BeautifulSoup(content, 'html.parser')  # parse page
print(soup.prettify())
Searching the Children of a Tag
# Walk the direct children of the <html> element and print two of them.
import bs4 as bs
import urllib.request

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen('https://lin-chen-va.github.io/') as source:
    content = source.read()
soup = bs.BeautifulSoup(content, 'html.parser')
html = soup.find_all('html')  # find html tag
# .children is an iterator over direct children only; list() makes it indexable.
html_children = list(html[0].children)
# NOTE(review): the indexes below are positional and depend on this page's
# markup (presumably whitespace text nodes sit at the even positions) — confirm
# against the printed output if the page changes.
print(html_children[1])  # expected to be the <head> element
print(html_children[3])  # expected to be the <body> element
Finding all instances of a tag
# Collect every <section> tag on the page and print the first one.
import bs4 as bs
import urllib.request

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen('https://lin-chen-va.github.io/') as source:
    content = source.read()
soup = bs.BeautifulSoup(content, 'html.parser')
sections = soup.find_all('section')  # find all <section> tags
print(sections[0])  # raises IndexError if the page has no <section>
Searching for tags by class
# Print every tag whose CSS class is "major".
import bs4 as bs
import urllib.request

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen('https://lin-chen-va.github.io/') as source:
    content = source.read()
soup = bs.BeautifulSoup(content, 'html.parser')
# The keyword is class_ (trailing underscore) because `class` is reserved in Python.
majors = soup.find_all(class_='major')  # find html tags by class name
for major in majors:
    print(major)
Searching for tag by id
# Look up the tag whose id attribute is "teaching" and print it.
import bs4 as bs
import urllib.request

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen('https://lin-chen-va.github.io/') as source:
    content = source.read()
soup = bs.BeautifulSoup(content, 'html.parser')
teaching = soup.find_all(id='teaching')  # find html tags by id attribute
print(teaching[0])  # raises IndexError if no tag has this id
Searching for tag by attributes
# Find tags by arbitrary attributes: here, those whose type attribute is
# "application/ld+json".
# NOTE(review): `item` is not defined in this snippet; presumably it is a
# BeautifulSoup object or Tag obtained earlier — confirm.
scripts = item.find_all(
    attrs={"type": "application/ld+json"},
)
Searching Using CSS Selectors
# Print every tag matched by the CSS selector 'section p'.
import bs4 as bs
import urllib.request

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen('https://lin-chen-va.github.io/') as source:
    content = source.read()
soup = bs.BeautifulSoup(content, 'html.parser')
ps = soup.select('section p')  # descendant selector: <p> tags inside a <section>
for p in ps:
    print(p)
Reference