Python – Web Scraping In The Real World
Yeah, Web Scraping is super easy in Python, just pip install BeautifulSoup and away you go. Not.
I have done a number of web scraping projects in my data life, and very rarely do i get to use something like BeautifulSoup that makes the project go quick, fast, and easy. Usually it’s because of poorly formatted HTML or other such data embedded in JavaScript that just blows up things like BeautifulSoup.
In the end sometimes it’s just easier to tackle it using other methods.
Below is the complete code I used to take a list of 600 plus Home Depot store numbers, and about 400 plus Lowe’s store numbers, and get all the address data into a RBDMS for reporting purposes.
See all code on GitHub
Some interesting finds. Home Depot was super easy to scrape, HTML was not badly formatted. Lowes was another story. I had to have Python pretend to be a web browser, a normal request via Python was blocked (because of headers). Also be careful, to many requests and they will block you.
''' SQL TABLES NEEDED CREATE TABLE dbo.StoreAddress (StoreNum INT, StoreName NVARCHAR(30), Address1 NVARCAR(30), City NVARCHAR(30), State NVARCHAR(10), PostalCode NVARCHAR(30), Country NVARCHAR(30)) ''' from urllib.request import Request, urlopen import json import pyodbc import time class Address(): #create address class with its attributes def __init__(self, storeNumber): self.storeNumber = str(storeNumber).replace("{'",'').replace("'}",'') self.Address1 = '' self.Address2 = '' self.City = '' self.State = '' self.PostalCode = '' self.Country = '' def set_address1(self, slice, storeName): #simply split what we need for each attribute if storeName == 'THE HOME DEPOT' or storeName == 'HOME DEPOT CANADA': start = '"street":"' end = '"}' self.Address1 = find_between( slice.encode(), start.encode(), end.encode() ) elif storeName == "LOWE'S COMPANIES, INC.": start = slice.split('<li>',1)[-1] end = start.split('</li>',1)[0] self.Address1 = end def set_city(self, slice, storeName): if storeName == 'THE HOME DEPOT' or storeName == 'HOME DEPOT CANADA': start = '"city":"' end = '","street' self.City = find_between( slice.encode(), start.encode(), end.encode() ) elif storeName == "LOWE'S COMPANIES, INC.": start = slice.split('<li>',1)[-1] middle = start.split('<li>',1)[-1] end = middle.split(',',1)[0] self.City = end def set_state(self, slice, storeName): if storeName == 'THE HOME DEPOT' or storeName == 'HOME DEPOT CANADA': start = '"state":"' end = '","country"' self.State = find_between( slice.encode(), start.encode(), end.encode() ) elif storeName == "LOWE'S COMPANIES, INC.": start = slice.split(', ',1)[-1] end = start.split(' ',1)[0] self.State = end def set_postalcode(self, slice, storeName): if storeName == 'THE HOME DEPOT' or storeName == 'HOME DEPOT CANADA': start = '"postalCode":"' end = '"' self.PostalCode = find_between( slice.encode(), start.encode(), end.encode() ) elif storeName == "LOWE'S COMPANIES, INC.": start = slice.split('<li>',1)[-1] middle = start.split('<li>',1)[-1] end = middle.split('</li>',1)[0] end = end[-5:] self.PostalCode = end def set_country(self, slice, storeName): if storeName == 'THE HOME DEPOT' or storeName == 'HOME DEPOT CANADA': start = '"country":"' end = '","city"' self.Country = find_between( slice.encode(), start.encode(), end.encode() ) elif storeName == "LOWE'S COMPANIES, INC.": self.Country = '' def getStoreList(): #go get em' storeList = [] #empty list cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER=YOURSERVER;DATABASE=YOURDATABASE;UID=USERID;PWD=PASSWORD') #connect cursor = cnxn.cursor() #open cursor query = cursor.execute("SELECT StoreNum FROM dbo.StoreAddress WHERE Address1 IS NULL") for row in cursor.fetchall(): #got'em storeList.append({row.StoreNum}) #add'em cnxn.close() #clean up return storeList def buildURL(storeNumber,storeName): #make up full url to hit. if storeName == 'THE HOME DEPOT' or storeName == 'HOME DEPOT CANADA': url = 'https://www.homedepot.com/l/search/' + str(storeNumber).replace("{'",'').replace("'}",'') + '/full/' elif storeName == "LOWE'S COMPANIES, INC.": url = '' return url def makeRequest(url,storeNumber,storeName): if storeName == "LOWE'S COMPANIES, INC.": req = Request('http://lowes.know-where.com/lowes/cgi/site?site=' + str(storeNumber).replace("{'",'').replace("'}",'') + '&design=default&lang=en&option=&mapid=us', headers={'User-Agent': 'Mozilla/5.0'}) html = urlopen(req).read() time.sleep(3) # Lowes is too smart, blocks python and too many requests. elif storeName == "THE HOME DEPOT": with urlopen(url) as response: html = response.read() return html def sliceEm(requestedHTML,storeNumber,storeName): if storeName == 'THE HOME DEPOT' or storeName == 'HOME DEPOT CANADA': start = '"storeId":"' + str(storeNumber).replace("{'",'').replace("'}",'') + '",' end = ',"coordinates":' slice = find_between( requestedHTML, start.encode(), end.encode() ) slice = str(slice).replace("b'","") elif storeName == "LOWE'S COMPANIES, INC.": start = '<li>Store Number:' end = '/ul>' slice = find_between( requestedHTML, start.encode(), end.encode() ) slice = str(slice).replace("b'","") return slice def putAddy(Address1,City,State,PostalCode,Country,storeNumber): cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER=YOURSERVER;DATABASE=YOURDATABASE;UID=USERID;PWD=PASSWORD') #connect cursor = cnxn.cursor() #open cursor query = cursor.execute("UPDATE SysproReporting.dbo.StoreAddress SET Address1 = ?, City = ?, State = ?, PostalCode = ?, Country = ? WHERE StoreNum = ?",Address1,City,State,PostalCode,Country,storeNumber) cnxn.commit() cnxn.close() def find_between( s, first, last ): try: start = s.index( first ) + len( first ) end = s.index( last, start ) return s[start:end] except ValueError: return "" def main(): storeList = getStoreList() #make list of store numbers for storeNumber in storeList: #for each store make url request storeName = "LOWE'S COMPANIES, INC." url = buildURL(storeNumber, storeName) #build url with storeNumber requestedHTML = makeRequest(url,storeNumber, storeName) slice = sliceEm(requestedHTML,storeNumber, storeName) addy = Address(storeNumber) #create instance of address addy.set_address1(slice, storeName) addy.set_city(slice, storeName) addy.set_state(slice, storeName) addy.set_postalcode(slice, storeName) addy.set_country(slice, storeName) putAddy(addy.Address1,addy.City,addy.State,addy.PostalCode,addy.Country,addy.storeNumber) print("finished one") if __name__ == "__main__": main()