Used API instead of scraping to find courses from year

This commit is contained in:
Boyan 2024-11-18 20:05:12 +01:00
parent da4705b56a
commit 52d9d86260

View File

@@ -1,71 +1,70 @@
from bs4 import BeautifulSoup
from .course import Course from .course import Course
from .exceptions.course_unavailable import CourseUnavailable from bs4 import BeautifulSoup
class Year: class Year:
""" """
Represents an academic year. Represents an academic year.
""" """
def __init__(self, session, year_path: str):
self.session = session
self.year_path = year_path # e.g., '2023-2024'
self.base_url = "https://themis.housing.rug.nl"
self.api_url = f"{self.base_url}/api/navigation/{self.year_path}"
def __init__(self, session, start_year: int, end_year: int): def all_courses(self) -> list:
self.start = start_year """
self.year = end_year Gets all visible courses in this year.
self.url = f"https://themis.housing.rug.nl/course/{self.start}-{self.year}" """
self._session = session response = self.session.get(self.api_url)
if response.status_code != 200:
raise ConnectionError(f"Failed to retrieve courses for {self.year_path}.")
def all_courses(self, errors: bool = True) -> list[Course]: courses_data = response.json()
"""
Gets all visible courses in a year.
"""
r = self._session.get(self.url)
soup = BeautifulSoup(r.text, "lxml")
lis = soup.find_all("li", class_="large")
courses = [] courses = []
for li in lis: for course_info in courses_data:
try: if course_info.get("visible", False):
suffix = li.a["href"].replace(f"course/{self.start}-{self.year}", "") course_path = course_info["path"]
course_url = self.url + suffix course_title = course_info["title"]
course_name = li.a.text.strip() courses.append(Course(self.session, course_path, course_title, self))
courses.append(
Course(course_url, course_name, self._session, self)
)
except CourseUnavailable as exc:
if errors:
raise CourseUnavailable(
message=f"Course {li.a.text} in year {self.start}-{self.year} unavailable"
) from exc
print("Error with course", li.a.text)
continue
return courses return courses
def get_course(self, name: str) -> Course: def get_course(self, course_title: str) -> Course:
""" """
Gets a course by name. Gets a course by its title.
""" """
r = self._session.get(self.url) all_courses = self.all_courses()
soup = BeautifulSoup(r.text, "lxml") for course in all_courses:
course_link = soup.find("a", text=name) if course.title == course_title:
if not course_link: return course
raise CourseUnavailable(f"No such course found: {name}") raise ValueError(f"Course '{course_title}' not found in year {self.year_path}.")
suffix = course_link["href"].replace(f"course/{self.start}-{self.year}", "")
course_url = self.url + suffix
return Course(course_url, name, self._session, self)
def get_course_by_url(self, url: str) -> Course: from bs4 import BeautifulSoup
def get_course_by_tag(self, course_tag: str) -> Course:
""" """
Gets a course by url. Gets a course by its tag (course identifier).
Constructs the course URL using the year and course tag.
""" """
r = self._session.get(url) course_path = f"/{self.year_path}/{course_tag}"
soup = BeautifulSoup(r.text, "lxml") course_url = f"{self.base_url}/course{course_path}"
# <a class="fill accent large" href="https://themis.housing.rug.nl/course/2023-2024/adinc-cs">Algorithms and Data Structures for CS</a>
course_link = soup.find_all("a", class_="fill accent large") response = self.session.get(course_url)
name = None if response.status_code != 200:
for link in course_link: raise ConnectionError(f"Failed to retrieve course with tag '{course_tag}' for year {self.year_path}. Tried {course_url}")
if url in link["href"]:
name = link.text soup = BeautifulSoup(response.text, "lxml")
break
title_element = soup.find("h1")
if not name: if not title_element:
raise CourseUnavailable(f"No such course found: {url}") title_elements = soup.find_all("a", class_="fill accent large")
return Course(url, name, self._session, self) if title_elements:
title_element = title_elements[-1]
if title_element:
course_title = title_element.get_text(strip=True)
else:
raise ValueError(f"Could not retrieve course title for tag '{course_tag}' in year {self.year_path}.")
return Course(self.session, course_path, course_title, self)
def __str__(self):
return f"Year({self.year_path})"