To scrape a website using requests and bs4 in Python 3.
The requests library will make a GET request to a web server.
It downloads the HTML contents of a given web page.
The bs4 (BeautifulSoup) library is used to extract the text from the web page.
It is used for parsing the HTML content of a web page.
#import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Scrape the table of Indian state / union-territory capitals from Wikipedia
# and print it as a pandas data frame with the columns
# 'Number', 'States/UT' and 'Capital'.

# Base URL of the page to scrape (adjacent literals are joined into one
# string, so the URL contains no whitespace or line breaks).
base_url = (
    'https://en.wikipedia.org'
    '/wiki/List_of_state_and_union_'
    'territory_capitals_in_India'
)

# Make a GET request to the server. The timeout keeps the script from
# hanging indefinitely if the server never answers.
r = requests.get(base_url, timeout=30)

# Check the server response (prints e.g. "<Response [200]>").
print(r)

# Stop on an HTTP error status instead of trying to parse an error page.
r.raise_for_status()

# Initialize the soup object.
soup = BeautifulSoup(r.text, 'html.parser')

# Find the HTML table whose class attribute is exactly this string.
table = soup.find('table', class_='wikitable sortable plainrowheaders')
if table is None:
    # Without this check the loop below would fail with an opaque
    # AttributeError on None.
    raise RuntimeError(
        "no table with class 'wikitable sortable plainrowheaders' "
        "found at " + base_url
    )

# Column accumulators: list1 -> 'Number', list2 -> 'States/UT',
# list3 -> 'Capital'.
list1 = []
list2 = []
list3 = []

# Walk every table row.
for row in table.find_all('tr'):
    table_data = row.find_all('td')
    # The 'States/UT' value is read from the row's first <th> cell.
    table_head = row.find_all('th')
    # Only extract table body rows, not headings: a body row is taken to be
    # one with exactly six <td> cells. Rows without any <th> are skipped so
    # that table_head[0] cannot raise IndexError.
    if len(table_data) == 6 and table_head:
        # find(string=True) returns the first text node inside the cell.
        list1.append(table_data[0].find(string=True))
        list2.append(table_head[0].find(string=True))
        list3.append(table_data[1].find(string=True))

# Blank lines separating the response line from the data frame output.
print("\n")

# Store the three columns in a data frame.
df = pd.DataFrame({
    'Number': list1,
    'States/UT': list2,
    'Capital': list3,
})

# Print the data frame.
print(df)