First, the necessary libraries are imported, a helper function is defined, and the inputs are received.

---
import sys

import requests
from bs4 import BeautifulSoup

from generic_transform import *

def year_in_bounds(year, minYear, maxYear):
    # a year that cannot be parsed counts as 0, i.e. "unknown"
    try:
        year = int(year)
    except ValueError:
        year = 0
    if minYear and year < minYear:
        return False
    if maxYear and year > maxYear:
        return False
    return True

# load input phrase and parameters
e_type, e_values, params = get_from_args()
query = e_values["value"]

# get passed parameters or defaults into variables
# minYear/maxYear == 0 indicates no limit on that bound
numResults = 10
minYear = 0
maxYear = 0
if "numResults" in params:
    numResults = int(params["numResults"])
if "minYear" in params:
    minYear = int(params["minYear"])
if "maxYear" in params:
    maxYear = int(params["maxYear"])
---

Then, the HTTP GET request parameters are initialized. This is where basic HTTP knowledge comes in: the parameters are different for every website, and it is necessary to find them out by experimenting (a short experimentation sketch is given at the end of this section). After that, a while loop is entered which reads through at most 4 pages of results, each containing 50 results. Such a limit is necessary because the given input criteria might never be satisfied, and the search could otherwise go on endlessly.

---
# both the query and the number of results are sent in the HTTP GET request
par = {"q": query, "fa": "original-format:book", "all": "true", "c": "50"}
page_counter = 1
while page_counter <= 4:
---

Then, the next page of results is retrieved using the Requests library, and its content is parsed using the BeautifulSoup library.

---
    par["sp"] = page_counter
    req = requests.get("http://www.loc.gov/search/", params=par)
    soup = BeautifulSoup(req.text, "html.parser")
---

After that, it is necessary to navigate the HTML tree to extract the needed information. This is usually done by iterating over the parent tag that wraps each section describing a single entity; the information itself is then retrieved by navigating relative to that parent tag and saving it in the required format. Once gathered, the information is checked against the given criteria and, if it passes, the result is written out. After the required number of entities has been gathered, the transform exits.
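To make the relative navigation concrete before the real loop, here is a minimal, self-contained sketch. The HTML snippet is a hypothetical simplification of a single loc.gov result item, assumed purely for illustration; the live markup is more complex.

---
from bs4 import BeautifulSoup

# hypothetical, simplified markup mimicking one search result item
html = """
<div class="description">
  <h2><a href="//lccn.loc.gov/12345678">The Hobbit /</a></h2>
  <span class="contributor"><span>Tolkien, J. R. R.</span></span>
  <span class="date"><span>1937</span></span>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
for parent_tag in soup.find_all(class_="description"):
    # every lookup starts from parent_tag, so values belonging
    # to different result items never get mixed up
    print(parent_tag.h2.a.text)                              # The Hobbit /
    print(parent_tag.find(class_="contributor").span.text)   # Tolkien, J. R. R.
    print(parent_tag.find(class_="date").span.text)          # 1937
---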
---
    for parent_tag in soup.find_all(class_="description"):
        # navigate to the title tag and get the string
        # the string occasionally has trailing whitespace and punctuation (/,.), so remove that too
        title_tag = parent_tag.h2.a
        title_string = title_tag.text.strip()
        while title_string and not title_string[-1].isalnum():
            title_string = title_string[:-1]

        # title_tag is actually an <a> tag which gives us the url
        # it's in this format: //lccn.loc.gov/, so we just add "http:" in front
        url_string = "http:" + title_tag["href"]

        # navigate to the authors tag and check if it exists
        # if it does, get the string, it's already in the correct format, otherwise leave it empty
        authors_tag = parent_tag.find(class_="contributor")
        if authors_tag is not None:
            authors_string = authors_tag.span.text
        else:
            authors_string = ""

        # navigate to the date tag and check if it exists
        # if it does, get the string, it's also in the correct format, otherwise leave it empty
        date_tag = parent_tag.find(class_="date")
        if date_tag is not None:
            date_string = date_tag.span.text
        else:
            date_string = ""

        # if there are bounds on the book year, check them
        if not year_in_bounds(date_string, minYear, maxYear):
            continue

        # write the final result
        write_result("book", {"value": title_string, "authors": authors_string, "date": date_string, "url": url_string})
        numResults -= 1
        if numResults == 0:
            sys.exit(0)

    # move on to the next page of results
    page_counter += 1
---
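Finally, a note on the experimentation mentioned above: when adapting this transform to another website, the quickest way to discover the right GET parameters is to let Requests show the exact URL it builds from the parameter dictionary and compare it against what the browser's address bar shows. A minimal sketch, with example parameter values:

---
import requests

par = {"q": "tolkien", "fa": "original-format:book", "c": "5"}
req = requests.get("http://www.loc.gov/search/", params=par)

# the fully encoded URL that was actually requested
print(req.url)
print(req.status_code)
---

Changing one parameter at a time and observing how the returned page differs quickly reveals which parameters the site actually honors.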