# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import matplotlib.pyplot as plt
import heapq

# Start a Chrome session through the local chromedriver binary and load the start page
service = Service(executable_path="chromedriver-win64/chromedriver.exe")
driver = webdriver.Chrome(service=service)
url = "https://pages.cs.wisc.edu/~yw/CS320F23THH1.html"
driver.get(url)
# Printed once driver.get() has returned
print("DONE")

DONE

# Get the links on the page
def get_links():
    """Collect the scored links on the page currently loaded in `driver`.

    Anchors whose visible text is exactly "Link" are paired, in the order
    find_elements returns them, with the elements whose ids are "h1",
    "h2", ...; the text of each such element is parsed as a float.

    Returns:
        A list of (h, href) tuples, one per matching anchor.
    """
    anchors = [a for a in driver.find_elements("tag name", "a") if a.text == "Link"]
    scored = []
    for position, anchor in enumerate(anchors, start=1):
        # The n-th "Link" anchor takes its value from the element with id "h<n>"
        label = driver.find_element("id", f"h{position}")
        scored.append((float(label.text), anchor.get_attribute("href")))
    return scored

# Demo call: list the (h, href) pairs found on the page currently loaded in the driver
get_links()

[(3.0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH2.html'),
 (3.0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH3.html'),
 (3.0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH4.html')]

# Check if there is an additional image on the page
def goal_check():
    """Report whether the currently loaded page contains exactly two images.

    When it does, the first image is saved to "image_gbs.png", its `src`
    attribute is printed, and the saved file is displayed with matplotlib.

    Returns:
        True when exactly two <img> elements are found, otherwise False.
    """
    imgs = driver.find_elements("tag name", "img")
    if len(imgs) != 2:
        return False
    first = imgs[0]
    first.screenshot("image_gbs.png")
    print(first.get_attribute("src"))
    plt.imshow(plt.imread("image_gbs.png"))
    return True

# Demo call: check whether the currently loaded page counts as the goal (exactly two <img> elements)
goal_check()

False

# GBS to find the goal page
def crawl(url, max_steps=100):
    """Greedy best-first search from `url` for a page that passes goal_check().

    Candidate pages are kept in a min-heap of (h, href) tuples as returned
    by get_links(), so the entry with the smallest h is loaded next.

    Args:
        url: Address of the start page; it is queued with priority 0.
        max_steps: Upper bound on the number of queue pops, guarding
            against an endless crawl. Defaults to 100.

    Returns:
        The URL of the first page for which goal_check() returns True, or
        None if the queue empties or `max_steps` pops happen first.
    """
    visited = set()
    frontier = [(0, url)]
    steps = 0
    while frontier and steps < max_steps:
        steps += 1
        entry = heapq.heappop(frontier)
        page = entry[1]
        if page in visited:
            # The same URL can be queued more than once before it is visited;
            # load it only the first time it is popped.
            continue
        visited.add(page)
        driver.get(page)
        print(entry)
        if goal_check():
            return page
        for link in get_links():
            if link[1] not in visited:
                heapq.heappush(frontier, link)
    return None

# Run the search from the first page; crawl() returns the URL of the page where goal_check() succeeds
url = "https://pages.cs.wisc.edu/~yw/CS320F23THH1.html"
crawl(url)

(0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH1.html')
(3.0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH2.html')
(2.0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH5.html')
(1.0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH11.html')
(0.0, 'https://pages.cs.wisc.edu/~yw/CS320F23THH23.html')
https://pages.cs.wisc.edu/~yw/CS320/amongus.png

'https://pages.cs.wisc.edu/~yw/CS320F23THH23.html'

# Always quit when finished: quit() ends the WebDriver session and closes its browser windows
driver.quit()