In [1]:
# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import matplotlib.pyplot as plt
import heapq
In [2]:
# Launch Chrome through the local chromedriver binary and load the start page.
# NOTE(review): the driver path is relative to the working directory and points
# at a Windows build (win64 / .exe); adjust it when running on another platform.
service = Service(executable_path="chromedriver-win64/chromedriver.exe")
driver = webdriver.Chrome(service=service)
# Start page of the crawl. get_links() and goal_check() below inspect whatever
# page this module-level `driver` currently has loaded.
url = "https://pages.cs.wisc.edu/~yw/CS320F23THH1.html"
driver.get(url)
print("DONE")
DONE
In [3]:
# Get the links on the page
def get_links():
    """Collect the outgoing links of the page currently loaded in ``driver``.

    Only anchors whose visible text is exactly ``"Link"`` are considered.
    The n-th such anchor (counting from 1) is paired with the element whose
    id is ``"h<n>"``; that element's text is parsed as a float.

    Returns
    -------
    list of (float, str)
        One ``(value, href)`` tuple per matching anchor, in page order.
        ``crawl`` pushes these tuples onto its heap, so the float acts as
        the priority of the link.

    Raises
    ------
    ValueError
        If the text of an ``h<n>`` element cannot be parsed as a float.
        (A missing ``h<n>`` element surfaces as whatever error
        ``driver.find_element`` raises.)
    """
    anchors = driver.find_elements("tag name", "a")
    pages = [anchor for anchor in anchors if anchor.text == "Link"]
    hrefs = []
    # The ids on the page are 1-based, hence start=1.
    for position, page in enumerate(pages, start=1):
        span = driver.find_element("id", "h" + str(position))
        h = float(span.text)
        hrefs.append((h, page.get_attribute("href")))
    return hrefs
# Show the (value, href) pairs found on the page that is currently loaded.
get_links()
Out[3]:
[(3.0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH2.html'), (3.0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH3.html'), (3.0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH4.html')]
In [4]:
# Check if there is an additional image on the page
def goal_check():
    """Report whether the page currently loaded in ``driver`` is the goal.

    A page counts as the goal when it contains exactly two ``<img>``
    elements. In that case the first image is saved to ``image_gbs.png``,
    its ``src`` attribute is printed, and the saved file is displayed with
    matplotlib.

    Returns
    -------
    bool
        ``True`` for a goal page, ``False`` otherwise.
    """
    imgs = driver.find_elements("tag name", "img")
    if len(imgs) != 2:
        return False
    first_img = imgs[0]
    first_img.screenshot("image_gbs.png")
    print(first_img.get_attribute("src"))
    picture = plt.imread("image_gbs.png")
    plt.imshow(picture)
    return True
# Check whether the page that is currently loaded is the goal page.
goal_check()
Out[4]:
False
In [5]:
# Greedy best-first search (GBS) to find the goal page
def crawl(url, max_pages=100):
    """Greedy best-first search for the goal page, starting at ``url``.

    Pages are expanded in order of the smallest value reported by
    ``get_links()`` (a min-heap of ``(value, href)`` tuples). Each page is
    loaded in the module-level ``driver`` at most once, its heap entry is
    printed, and ``goal_check()`` decides whether the search is over.

    Parameters
    ----------
    url : str
        Address of the start page; it enters the heap with priority 0.
    max_pages : int, optional
        Upper bound on the number of heap pops, guarding against an
        endless crawl. Pops of already-visited pages count towards the
        bound. Defaults to 100.

    Returns
    -------
    str or None
        The address of the first page for which ``goal_check()`` is true,
        or ``None`` if the heap empties or the bound is reached first.
    """
    visited = set()
    frontier = [(0, url)]
    pops = 0
    while frontier and pops < max_pages:
        pops += 1
        entry = heapq.heappop(frontier)
        page = entry[1]
        if page in visited:
            # The same page can be pushed several times before it is visited.
            continue
        visited.add(page)
        driver.get(page)
        print(entry)
        if goal_check():
            return page
        for link in get_links():
            if link[1] not in visited:
                heapq.heappush(frontier, link)
    return None
# Run the search from the start page; the cell's output is the URL of the
# goal page (nothing is shown if no goal page is found).
url = "https://pages.cs.wisc.edu/~yw/CS320F23THH1.html"
crawl(url)
(0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH1.html') (3.0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH2.html') (2.0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH5.html') (1.0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH11.html') (0.0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH23.html') https://pages.cs.wisc.edu/~yw/CS320/amongus.png
Out[5]:
'https://pages.cs.wisc.edu/~yw/CS320F23THH23.html'
In [6]:
# Remember to quit when it's done: this ends the WebDriver session and
# closes the browser window that the setup cell opened.
driver.quit()