Part of Slepp's Projects: Pastebin | TURL | Imagebin | Filebin
Feedback -- English French German Japanese
Create Upload Newest Tools Donate
Sign In | Create Account

mmls-sync
Tuesday, February 9th, 2010 at 11:43:50am MST 

  1. import re
  2. import os
  3. import os.path
  4. import getpass
  5. import urllib.request
  6. import urllib.parse
  7. import http.cookiejar
  8. import html.parser
  9.  
  10. # define some global variables
  11. site_hostname = "mmlsmelaka.mmu.edu.my"
  12. site_url = "http://" + site_hostname
  13. site_url_secure = "https://" + site_hostname
  14.  
  15. class LoginParser(html.parser.HTMLParser):
  16.    
  17.     def handle_starttag(self, tag, attrs):
  18.        
  19.         global mmls_hst, mmls_key
  20.        
  21.         if tag == "input":
  22.            
  23.             name = [v for k, v in attrs if k == "name"][0]
  24.             value = ([v for k, v in attrs if k == "value"] or [""])[0]
  25.            
  26.             if name == "hst":
  27.                 mmls_hst = value
  28.             elif name == "key":
  29.                 mmls_key = value
  30.            
  31.  
  32. class MMLSParser(html.parser.HTMLParser):
  33.    
  34.     # prepare our regex
  35.     course_regex = re.compile(r"[A-Z]{3}\d{4}")
  36.     link_regex = re.compile(r"/Student/Courses/coursecontent/courses\.php\?crskid=\w{51}&crdid=\d{10}", re.IGNORECASE)
  37.    
  38.     roman_regex = re.compile(r"^I{1,3}$", re.IGNORECASE)
  39.    
  40.     # some local variables
  41.     _handle_a = 0
  42.     _handle_td = 0
  43.     _handle_td_counter = 0
  44.     _handle_td_extract = 0
  45.    
  46.     # lists
  47.     course_id = []
  48.     course_name = []
  49.     course_link = []
  50.    
  51.     def handle_starttag(self, tag, attrs):
  52.        
  53.         if tag == "a":
  54.            
  55.             href = [v for k, v in attrs if k == "href"][0]
  56.             if href and self.link_regex.match(href):
  57.                 # add this link to our course link list
  58.                 self.course_link.append(href)
  59.                 self._handle_a = 1
  60.                 self._handle_td = 1
  61.        
  62.         if self._handle_td and tag == "td":
  63.            
  64.             if self._handle_td_counter >= 2:
  65.                 # reset flag and counter
  66.                 self._handle_td = 0
  67.                 self._handle_td_counter = 0
  68.                 self._handle_td_extract = 1
  69.             else:
  70.                 # increment counter
  71.                 self._handle_td_counter += 1
  72.                
  73.    
  74.     def handle_data(self, data):
  75.        
  76.         if self._handle_a:
  77.             # append this to our course name
  78.             self._handle_a = 0
  79.             self.course_id.append(data.strip())
  80.            
  81.         if self._handle_td_extract:
  82.             # extract the course name and convert ANNOYING UPPERCASE to Proper Case
  83.             name = []
  84.             for word in data.strip().split():
  85.                 if self.roman_regex.match(word):
  86.                     name.append(word.upper())
  87.                 else:
  88.                     name.append(word.capitalize())
  89.                
  90.             name = " ".join(name)
  91.            
  92.             # append to list
  93.             self.course_name.append(name)
  94.            
  95.             # done extracting
  96.             self._handle_td_extract = 0
  97.            
  98.             # generate the resultant courses list
  99.             self.course_list = tuple(zip(self.course_id, self.course_name, self.course_link))
  100.            
  101.  
  102.  
  103.  
  104. # prepare the connection
  105. cj = http.cookiejar.CookieJar()
  106. opener = urllib.request.build_opener(
  107.     urllib.request.HTTPCookieProcessor(cj)
  108. )
  109.  
  110. # connect to MMLS Melaka
  111. print("Connecting to MMLS Melaka ...")
  112. f = opener.open(site_url_secure)
  113.  
  114. parser = LoginParser()
  115. parser.feed(f.read().decode())
  116. parser.close()
  117.  
  118. f.close()
  119.  
  120. # attempt to login
  121. print()
  122. mmls_userid = input("User ID: ")
  123. mmls_password = getpass.getpass("Password: ")
  124. print()
  125.  
  126. params = urllib.parse.urlencode(
  127.     {
  128.         "Submit"        : " Login ",
  129.         "hst"           : mmls_hst,
  130.         "key"           : mmls_key,
  131.         "txtPassword"   : mmls_password,
  132.         "txtUserID"     : mmls_userid,
  133.     }
  134. )
  135. f = opener.open(site_url_secure + "/check_login.php", params)
  136.  
  137. # is login successful?
  138. if f.url == site_url_secure + "/Student/Default/Main.php":
  139.     # login successful
  140.     print("Login successful ... processing course list")
  141.    
  142.     # feed the content to our parser to generate the course list
  143.     parser = MMLSParser()
  144.     parser.feed(f.read().decode())
  145.     parser.close()
  146.     f.close()
  147.     course_list = parser.course_list
  148.    
  149.     # prepare our regex
  150.     frame_regex = re.compile(r"sconav_new.php\?t=\d+&lo=no", re.IGNORECASE)
  151.     tree_regex = re.compile(r"/Student/Courses/coursecontent/mynav/course_tree/[A-Z]{3}\d{4}.*_tree\.js", re.IGNORECASE)
  152.     downloadable_regex = re.compile(r"Launch.php\?.*&*path=Contents/.+\.\w{1,4}", re.IGNORECASE)
  153.     filename_regex = re.compile(r"/[\w\s%,.-]+\.\w{1,4}$", re.IGNORECASE)
  154.    
  155.     # scan for downloadable contents
  156.     print()
  157.     print("Course list acquired ... now scanning for downloadable contents")
  158.     print()
  159.     for course in course_list:
  160.         # load and parse the course page for frame link
  161.         f = opener.open(site_url + course[2])
  162.         page = f.read().decode()
  163.         f.close()
  164.         frame = frame_regex.search(page).group()
  165.        
  166.         # now load the actual frame and parse for downloadable contents
  167.         f = opener.open(site_url + "/Student/Courses/coursecontent/mynav/" + frame, timeout=5)
  168.         page = f.read().decode()
  169.         f.close()
  170.         tree = tree_regex.search(page).group()
  171.        
  172.         # finally we can load the tree script and scan for downloadable contents
  173.         f = opener.open(site_url + tree, timeout=5)
  174.         page = f.read().decode()
  175.         f.close()
  176.         downloads = downloadable_regex.findall(page)
  177.        
  178.         # create a folder to store these files
  179.         directory = "{0} - {1}".format(course[0], course[1])
  180.         if not os.path.exists(directory):
  181.             # mkdir
  182.             os.mkdir(directory)
  183.        
  184.         # download these files
  185.         for download in downloads:
  186.             # process and generate the download link
  187.             pos = download.index("&path=")
  188.             path = download[pos+6:]
  189.             header = download[:pos]
  190.             path = "/".join([urllib.parse.quote(x) for x in path.split("/")])
  191.             url = site_url + "/Student/Courses/coursecontent/mynav/" + header + "&path=" + path
  192.            
  193.             # extract the file name
  194.             filename = os.path.join(directory, filename_regex.search(download).group()[1:])
  195.            
  196.             try:
  197.                 # start downloading
  198.                 downstream = opener.open(url, timeout=5)
  199.                
  200.                 # do we need to download this file ?
  201.                 filesize = int([v for (k, v) in downstream.info()._headers if k == "Content-Length"][0])
  202.                 if not os.path.exists(filename):
  203.                     # start downloading
  204.                     print("{0} ... downloading".format(filename))
  205.                    
  206.                     # in and out
  207.                     file = open(filename, "wb")
  208.                     data = downstream.read()
  209.                     file.write(data)
  210.                     downstream.close()
  211.                     file.close()
  212.                    
  213.                 else:
  214.                     # updated
  215.                     print("{0} ... latest".format(filename))
  216.                
  217.                 # done, cleanup
  218.                 downstream.close()
  219.                
  220.             except UnicodeEncodeError as e:
  221.                 # encoding error, no idea what cause it anyway
  222.                 pass
  223.                
  224.             except Exception as e:
  225.                 # exception occurred
  226.                 print("{0} ... {1}".format(filename, e))
  227.                
  228.    
  229.     # done !
  230.     print()
  231.     print("Content synchronized with MMLS Melaka")
  232.  
  233. else:
  234.     # failed
  235.     print("Something went wrong!")
  236.    
  237. # cleanup
  238. opener.close()
  239.  
  240. # press any key to continue
  241. input(" ")

advertising

Update the Post

Either update this post and resubmit it with changes, or make a new post.

You may also comment on this post.

update paste below
details of the post (optional)

Note: Only the paste content is required, though the following information can be useful to others.

Save name / title?

(space separated, optional)



Please note that information posted here will expire by default in one month. If you do not want it to expire, please set the expiry time above. If it is set to expire, web search engines will not be allowed to index it prior to it expiring. Items that are not marked to expire will be indexable by search engines. Be careful with your passwords. All illegal activities will be reported and any information will be handed over to the authorities, so be good.

worth-right
fantasy-obligation