- mmls-sync
- Tuesday, February 9th, 2010 at 11:43:50am MST
- import re
- import os
- import os.path
- import getpass
- import urllib.request
- import urllib.parse
- import http.cookiejar
- import html.parser
# Host name of the MMLS Melaka portal, plus the plain and TLS base URLs
# that every request in this script is built from.
site_hostname = "mmlsmelaka.mmu.edu.my"
site_url = "http://{0}".format(site_hostname)
site_url_secure = "https://{0}".format(site_hostname)
class LoginParser(html.parser.HTMLParser):
    """Extract the ``hst`` and ``key`` form fields from the login page.

    The values are published through the module-level globals ``mmls_hst``
    and ``mmls_key``, which the login request reads after parsing.
    """

    def handle_starttag(self, tag, attrs):
        """Record the value of every ``<input>`` named ``hst`` or ``key``.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs for the tag.
        """
        global mmls_hst, mmls_key
        if tag != "input":
            return
        # Inputs without a name (e.g. a bare submit button) are legal HTML;
        # look the attribute up without assuming it exists.
        name = next((v for k, v in attrs if k == "name"), None)
        # A valueless attribute is reported as None; normalise it to "" so it
        # is never url-encoded as the literal text "None".
        value = next((v for k, v in attrs if k == "value"), None) or ""
        if name == "hst":
            mmls_hst = value
        elif name == "key":
            mmls_key = value
class MMLSParser(html.parser.HTMLParser):
    """Collect the id, name and link of every course listed on a page.

    A course row is recognised by an ``<a>`` whose ``href`` matches
    ``link_regex``. The anchor text becomes the course id, and the text of
    the third ``<td>`` opened after that anchor becomes the course name.

    Attributes:
        course_id: Course ids, in document order.
        course_name: Course names converted to Proper Case.
        course_link: Course page links (the matched ``href`` values).
        course_list: Tuple of ``(id, name, link)`` triples; empty until the
            first complete course has been parsed.
    """

    # prepare our regex
    course_regex = re.compile(r"[A-Z]{3}\d{4}")
    link_regex = re.compile(r"/Student/Courses/coursecontent/courses\.php\?crskid=\w{51}&crdid=\d{10}", re.IGNORECASE)
    roman_regex = re.compile(r"^I{1,3}$", re.IGNORECASE)

    def __init__(self, *args, **kwargs):
        """Create a parser with its own, initially empty, result lists."""
        super().__init__(*args, **kwargs)
        # parser state flags
        self._handle_a = 0
        self._handle_td = 0
        self._handle_td_counter = 0
        self._handle_td_extract = 0
        # Per-instance lists: class-level lists would be shared by every
        # parser instance and accumulate results across pages.
        self.course_id = []
        self.course_name = []
        self.course_link = []
        # Always defined, so callers can read it even when no course is found.
        self.course_list = ()

    def handle_starttag(self, tag, attrs):
        """Track course anchors and count the table cells that follow them."""
        if tag == "a":
            # Anchors without an href (e.g. named anchors) are simply skipped.
            href = next((v for k, v in attrs if k == "href"), None)
            if href and self.link_regex.match(href):
                # add this link to our course link list
                self.course_link.append(href)
                self._handle_a = 1
                self._handle_td = 1
        elif tag == "td" and self._handle_td:
            if self._handle_td_counter >= 2:
                # third cell after the anchor: its text is the course name
                self._handle_td = 0
                self._handle_td_counter = 0
                self._handle_td_extract = 1
            else:
                self._handle_td_counter += 1

    def handle_data(self, data):
        """Consume the text that follows a course anchor or a name cell."""
        text = data.strip()
        if not text:
            # Whitespace between tags is reported as data too; it must not be
            # mistaken for a course id or name.
            return
        if self._handle_a:
            # the anchor text is the course id
            self._handle_a = 0
            self.course_id.append(text)
        if self._handle_td_extract:
            # convert ANNOYING UPPERCASE to Proper Case, keeping short roman
            # numerals (I, II, III) in upper case
            name = " ".join(
                word.upper() if self.roman_regex.match(word) else word.capitalize()
                for word in text.split()
            )
            self.course_name.append(name)
            # done extracting
            self._handle_td_extract = 0
            # regenerate the resultant courses list
            self.course_list = tuple(zip(self.course_id, self.course_name, self.course_link))
# prepare the connection; the cookie processor stores cookies set by the site
# in `cj` and sends them back on later requests made through `opener`
cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cj)
)

# Defaults for the hidden login fields: if the login page does not contain
# them, the login attempt fails cleanly instead of raising NameError.
mmls_hst = ""
mmls_key = ""

# connect to MMLS Melaka and let LoginParser pick up the hidden form fields
print("Connecting to MMLS Melaka ...")
parser = LoginParser()
# `with` closes the response even if reading or parsing raises
with opener.open(site_url_secure) as f:
    # only ASCII field names/values are of interest, so a stray undecodable
    # byte elsewhere in the page must not abort the run
    parser.feed(f.read().decode(errors="replace"))
parser.close()
# attempt to login: ask for the credentials and POST them together with the
# hidden fields taken from the login page
print()
mmls_userid = input("User ID: ")
mmls_password = getpass.getpass("Password: ")
print()
params = urllib.parse.urlencode(
    {
        "Submit" : " Login ",
        "hst" : mmls_hst,
        "key" : mmls_key,
        "txtPassword" : mmls_password,
        "txtUserID" : mmls_userid,
    }
# urlencode() returns str, but POST data must be bytes in Python 3; the
# encoded form is percent-escaped and therefore pure ASCII
).encode("ascii")
# `f` is left open on purpose: the code that follows inspects and reads it
f = opener.open(site_url_secure + "/check_login.php", params)
# Inspect the login response, then walk every course and download any file
# that is not yet present locally.


def _fetch_text(url, **open_kwargs):
    """Return the body of *url* as text, fetched through the shared opener.

    Extra keyword arguments (e.g. ``timeout``) are passed to ``opener.open``.
    Undecodable bytes are replaced rather than raising, because the pages
    are only searched with the ASCII patterns below.
    """
    with opener.open(url, **open_kwargs) as response:
        return response.read().decode(errors="replace")


def _report(message):
    """Print *message*, escaping characters the console cannot encode."""
    try:
        print(message)
    except UnicodeEncodeError:
        # e.g. a non-ASCII file name on a console with a narrow code page
        print(message.encode("ascii", "backslashreplace").decode("ascii"))


# is login successful?  `with` closes the login response on both branches
with f:
    login_ok = f.url == site_url_secure + "/Student/Default/Main.php"
    main_page = f.read().decode(errors="replace") if login_ok else ""

if login_ok:
    # login successful
    print("Login successful ... processing course list")
    # feed the content to our parser to generate the course list
    parser = MMLSParser()
    parser.feed(main_page)
    parser.close()
    # the attribute may be missing when the page lists no course at all
    course_list = getattr(parser, "course_list", ())
    # prepare our regex
    frame_regex = re.compile(r"sconav_new.php\?t=\d+&lo=no", re.IGNORECASE)
    tree_regex = re.compile(r"/Student/Courses/coursecontent/mynav/course_tree/[A-Z]{3}\d{4}.*_tree\.js", re.IGNORECASE)
    downloadable_regex = re.compile(r"Launch.php\?.*&*path=Contents/.+\.\w{1,4}", re.IGNORECASE)
    filename_regex = re.compile(r"/[\w\s%,.-]+\.\w{1,4}$", re.IGNORECASE)
    nav_base = site_url + "/Student/Courses/coursecontent/mynav/"
    # scan for downloadable contents
    print()
    print("Course list acquired ... now scanning for downloadable contents")
    print()
    for course in course_list:
        # folder that will store this course's files
        directory = "{0} - {1}".format(course[0], course[1])
        try:
            # load and parse the course page for frame link
            page = _fetch_text(site_url + course[2])
            frame = frame_regex.search(page)
            if frame is None:
                _report("{0} ... content frame not found, skipping".format(directory))
                continue
            # now load the actual frame and look for the tree script
            page = _fetch_text(nav_base + frame.group(), timeout=5)
            tree = tree_regex.search(page)
            if tree is None:
                _report("{0} ... course tree not found, skipping".format(directory))
                continue
            # finally load the tree script and scan for downloadable contents
            page = _fetch_text(site_url + tree.group(), timeout=5)
            downloads = downloadable_regex.findall(page)
            os.makedirs(directory, exist_ok=True)
        except OSError as e:
            # network failure, timeout or an unusable folder name: skip this
            # course but keep synchronising the others
            _report("{0} ... {1}".format(directory, e))
            continue
        # download these files
        for download in downloads:
            # split the link into the part before "&path=" and the path itself
            header, sep, path = download.partition("&path=")
            name_match = filename_regex.search(download)
            if not sep or name_match is None:
                _report("{0} ... unrecognised download link, skipping".format(download))
                continue
            # quote each path segment separately so the slashes survive
            path = "/".join(urllib.parse.quote(x) for x in path.split("/"))
            url = nav_base + header + sep + path
            # extract the file name (drop the leading slash of the match)
            filename = os.path.join(directory, name_match.group()[1:])
            try:
                # Existing files are never re-fetched, so check before
                # opening a connection for them.
                if os.path.exists(filename):
                    _report("{0} ... latest".format(filename))
                    continue
                _report("{0} ... downloading".format(filename))
                with opener.open(url, timeout=5) as downstream:
                    data = downstream.read()
                # Create the file only after the body arrived completely, so
                # a failed transfer cannot leave an empty file that the next
                # run would report as "latest".
                with open(filename, "wb") as file:
                    file.write(data)
            except Exception as e:
                # best effort: report the failure and go on with the next file
                _report("{0} ... {1}".format(filename, e))
    # done !
    print()
    print("Content synchronized with MMLS Melaka")
else:
    # failed
    print("Something went wrong!")
# cleanup
opener.close()
# press any key to continue
input(" ")
advertising
Update the Post
Either update this post and resubmit it with changes, or make a new post.
You may also comment on this post.
Please note that information posted here will expire by default in one month. If you do not want it to expire, please set the expiry time above. If it is set to expire, web search engines will not be allowed to index it prior to it expiring. Items that are not marked to expire will be indexable by search engines. Be careful with your passwords. All illegal activities will be reported and any information will be handed over to the authorities, so be good.