In this project, I wrote a function that scrapes and cleans data from SFU’s course outline webpages by parsing their HTML source code, and displays the result in a table.
The classes used are: STAT 270, STAT 100, STAT 240, and STAT 203.
The websites used are: http://www.sfu.ca/outlines.html?2019/spring/stat/100/d100, https://www.sfu.ca/outlines.html?2019/spring/stat/203/d100, https://www.sfu.ca/outlines.html?2019/spring/stat/240/d100, and https://www.sfu.ca/outlines.html?2019/spring/stat/270/d100
# Outline pages to scrape: spring 2019, section D100 of STAT 100, 203, 240
# and 270, one location per element.
course_url <- c(
  "http://www.sfu.ca/outlines.html?2019/spring/stat/100/d100",
  "https://www.sfu.ca/outlines.html?2019/spring/stat/203/d100",
  "https://www.sfu.ca/outlines.html?2019/spring/stat/240/d100",
  "https://www.sfu.ca/outlines.html?2019/spring/stat/270/d100"
)
#' Scrape SFU course outline pages into a data frame.
#'
#' Every page is read line by line with readLines(). Each field is located by
#' searching the lines for a fixed HTML marker and then taking the line at a
#' fixed offset from that marker, so the result depends on how the page's
#' source is split into lines.
#'
#' @param course_url Character vector of page locations; each element is
#'   handed to readLines(), so URLs and local file paths both work.
#' @return A data frame with one row per element of `course_url` and the
#'   columns Class.Number, Delivery.Method, Course.Name.And.Number, Title,
#'   Instructor, Course.Times.and.Locations, Textbook and
#'   Exam.Time.and.Location. A field whose marker or pattern is not found is
#'   NA (the exam column is "" when no exam text is found). Returns NULL when
#'   `course_url` is empty.
courses <- function(course_url) {
  # regmatches() yields character(0) when the pattern is absent, and
  # data.frame() rejects a zero-length column next to length-one columns.
  # Reduce every extracted field to exactly one value instead.
  first_or_na <- function(x) {
    if (length(x) > 0) x[1] else NA_character_
  }

  strip_tags <- function(x, replacement) {
    gsub("<[^>]+>", replacement, x)
  }

  # Build the one-row data frame for a single outline page.
  scrape_one <- function(url) {
    course_page <- readLines(url)

    # Line number of the last (or first) line matching `pattern`.
    # NA when nothing matches, so that indexing course_page with it gives NA
    # rather than failing on a zero-length index.
    line_of <- function(pattern, last = TRUE) {
      hits <- grep(pattern, course_page)
      if (length(hits) == 0) {
        return(NA_integer_)
      }
      if (last) hits[length(hits)] else hits[1]
    }

    # Class number and delivery method: the first two lines holding an <h3.
    h3_text <- strip_tags(course_page[grep("<h3", course_page)], "")
    details <- gsub("^\\s+|\\s+$", "", h3_text)
    classnum <- gsub("[^[:digit:]]", "", details[1])
    # The delivery method is taken to be the last two words of the heading.
    delivmethod <- first_or_na(regmatches(
      details[2],
      regexpr("[[:alpha:]]+\\s[[:alpha:]]+$", details[2])
    ))

    # Course name and number: read from the second line holding an <h1.
    h1_lines <- course_page[grep("<h1", course_page)]
    h1_text <- gsub("^\\s+|\\s+$", "", strip_tags(h1_lines[2], ""))
    coursenum <- first_or_na(regmatches(
      h1_text,
      regexpr("[[:upper:]]{2,}\\s[[:digit:]]+\\b", h1_text)
    ))

    # Course title: the line right after the last title heading.
    title_at <- line_of('<h2 id="title">')
    coursetitle <- gsub("^\\s+", "", course_page[title_at + 1])

    # Instructor: the line right after the last instructor heading.
    instructor_at <- line_of("<h4>Instructor:</h4>")
    instructorname <- gsub(
      "^\\s+", "", strip_tags(course_page[instructor_at + 1], "")
    )

    # Class times: the second line that closes an <h4> or a <p>.
    closing_lines <- course_page[grepl("</h4>|</p>", course_page)]
    times_text <- gsub("(^\\s+| $)", "", strip_tags(closing_lines[2], " "))
    # Drop " &name;"-shaped tokens (space, punctuation, alphanumerics,
    # punctuation), e.g. an HTML entity preceded by a space.
    classtimes <- gsub(" [[:punct:]][[:alnum:]]+[[:punct:]]", "", times_text)

    # Textbook: four lines below the first line mentioning READING.
    reading_at <- line_of("READING", last = FALSE)
    book_text <- strip_tags(course_page[reading_at + 4], " ")
    book_text <- gsub("(^\\s+| $)", "", book_text)
    book_text <- gsub("[&]+[[:alnum:]]{3,}[;]", "", book_text)
    textbook <- gsub("(\\s{2,})", " ", book_text)

    # Exam times: of the 8 lines starting at the last "Exam Times" line,
    # keep the 1st, 3rd, 4th, 7th and 8th.
    exam_at <- line_of("Exam Times")
    exam_text <- strip_tags(course_page[exam_at + c(0, 2, 3, 6, 7)], " ")
    exam_text <- gsub("(^\\s+| $)", "", exam_text)
    exam_text <- gsub(" [[:punct:]][[:alnum:]]+[[:punct:]]", "", exam_text)
    # NOTE(review): [PM] is a character class (one "P" or one "M"), so this
    # matches up to the last word-final P or M; kept as in the original.
    times <- regmatches(exam_text, regexpr(".+[PM]\\b", exam_text))
    place <- regmatches(
      exam_text,
      regexpr(
        "[[:upper:]]+\\s[[:digit:]]{4,}(\\s|[[:punct:]])+[[:alnum:]]+$",
        exam_text
      )
    )
    exam_parts <- c(times[1], place[1], exam_text[4], times[2], place[2])
    exam <- paste0(exam_parts[!is.na(exam_parts)], collapse = " ")

    data.frame(
      Class.Number = classnum,
      Delivery.Method = delivmethod,
      Course.Name.And.Number = coursenum,
      Title = coursetitle,
      Instructor = instructorname,
      Course.Times.and.Locations = classtimes,
      Textbook = textbook,
      Exam.Time.and.Location = exam
    )
  }

  # lapply() visits nothing for empty input (1:length(x) would visit 1 and 0),
  # and binding once at the end avoids growing a data frame inside a loop.
  # unname() keeps names on course_url from leaking into the row names.
  rows <- lapply(unname(course_url), scrape_one)
  do.call(rbind, rows)
}
# Scrape every page listed in course_url. The call is made at top level, so
# the value courses() returns is auto-printed.
courses(course_url)
LS0tCnRpdGxlOiAiU0ZVIENvdXJzZSBPdXRsaW5lIFdlYnNjcmFwZXIiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFfQprbml0cjo6b3B0c19jaHVuayRzZXQoZWNobyA9IFRSVUUpCmBgYAoKIwpJbiB0aGlzIHByb2plY3QsIEkgd3JvdGUgYSBmdW5jdGlvbiB0byBzY3JhcGUgYW5kIGNsZWFuIGRhdGEgZnJvbSBTRlUncyBjb3Vyc2Ugb3V0bGluZSB3ZWJwYWdlcyBhbmQgZGlzcGxheSBpdCBpbiBhIHRhYmxlIGJ5IHBhcnNpbmcgSFRNTCBzb3VyY2UgY29kZXMKClRoZSB0aGUgY2xhc3NlcyB1c2VkIGFyZTogU1RBVCAyNzAsU1RBVCAxMDAsIFNUQVQgMjQwLCBhbmQgU1RBVCAyMDMgCgpUaGUgd2Vic2l0ZXMgdXNlZCBhcmU6Cmh0dHA6Ly93d3cuc2Z1LmNhL291dGxpbmVzLmh0bWw/MjAxOS9zcHJpbmcvc3RhdC8xMDAvZDEwMCwgaHR0cHM6Ly93d3cuc2Z1LmNhL291dGxpbmVzLmh0bWw/MjAxOS9zcHJpbmcvc3RhdC8yMDMvZDEwMCwgaHR0cHM6Ly93d3cuc2Z1LmNhL291dGxpbmVzLmh0bWw/MjAxOS9zcHJpbmcvc3RhdC8yNDAvZDEwMCwgYW5kIGh0dHBzOi8vd3d3LnNmdS5jYS9vdXRsaW5lcy5odG1sPzIwMTkvc3ByaW5nL3N0YXQvMjcwL2QxMDAKCmBgYHtyfQpjb3Vyc2VfdXJsPWMoImh0dHA6Ly93d3cuc2Z1LmNhL291dGxpbmVzLmh0bWw/MjAxOS9zcHJpbmcvc3RhdC8xMDAvZDEwMCIsCiJodHRwczovL3d3dy5zZnUuY2Evb3V0bGluZXMuaHRtbD8yMDE5L3NwcmluZy9zdGF0LzIwMy9kMTAwIiwiaHR0cHM6Ly93d3cuc2Z1LmNhL291dGxpbmVzLmh0bWw/MjAxOS9zcHJpbmcvc3RhdC8yNDAvZDEwMCIsCiJodHRwczovL3d3dy5zZnUuY2Evb3V0bGluZXMuaHRtbD8yMDE5L3NwcmluZy9zdGF0LzI3MC9kMTAwIikKCmNvdXJzZXM9ZnVuY3Rpb24oY291cnNlX3VybCl7CiAgY291cnNlc2RmMT1OVUxMCiAgZm9yKGkgaW4gMTpsZW5ndGgoY291cnNlX3VybCkpewpjb3Vyc2VfcGFnZSA9IHJlYWRMaW5lcyhjb3Vyc2VfdXJsW2ldKQoKI0RlbGl2ZXJ5IG1ldGhvZApoZWFkaW5nX2luZGV4PWdyZXAoIjxoMyIsIGNvdXJzZV9wYWdlKQpmb3JtYXQ9Z3N1YigiPFtePl0rPiIsIiIsY291cnNlX3BhZ2VbaGVhZGluZ19pbmRleF0pICNyZW1vdmUgPCB0byB0aGUgZW5kID4KZGV0YWlscz1nc3ViKCJeXFxzK3xcXHMrJCIsICIiLCBmb3JtYXQpICNyZW1vdmUgc3BhY2VzCmNsYXNzbnVtPWdzdWIoIlteWzpkaWdpdDpdXSIsIiIsZGV0YWlsc1sxXSkKZ2V0bWV0aG9kPXJlZ2V4cHIoIltbOmFscGhhOl1dK1xcc1tbOmFscGhhOl1dKyQiLGRldGFpbHNbMl0pCmRlbGl2bWV0aG9kPXJlZ21hdGNoZXMoZGV0YWlsc1syXSxnZXRtZXRob2QpCgojQ291cnNlIG51bWJlcgpoZWFkaW5nX2luZGV4Mj1ncmVwKCI8aDEiLCBjb3Vyc2VfcGFnZSkKZm9ybWF0PWdzdWIoIjxbXj5dKz4iLCIiLGNvdXJzZV9wYWdlW2hlYWRpbmdfaW5kZXgyXVsyXSkgCmZvcm1hdD1nc3ViKCJeXFxzK3xcXHMrJCIsICIiLCBm
b3JtYXQpCmdldD1yZWdleHByKCJbWzp1cHBlcjpdXXsyLH1cXHNbWzpkaWdpdDpdXStcXGIiLGZvcm1hdCkKY291cnNlbnVtPXJlZ21hdGNoZXMoZm9ybWF0LGdldCkKCiNDb3Vyc2UgdGl0bGUKaGVhZGluZ19pbmRleDM9Z3JlcCgnPGgyIGlkPSJ0aXRsZSI+JywgY291cnNlX3BhZ2UpCmluZGV4MT1jb3Vyc2VfcGFnZVsoaGVhZGluZ19pbmRleDNbbGVuZ3RoKGhlYWRpbmdfaW5kZXgzKV0pOihoZWFkaW5nX2luZGV4M1tsZW5ndGgoaGVhZGluZ19pbmRleDMpXSsxKV0KY291cnNldGl0bGU9Z3N1YigiXlxccysiLCIiLGluZGV4MVsyXSkKCiNOYW1lIG9mIHRoZSBjb3Vyc2UgaW5zdHJ1Y3RvcgpoZWFkaW5nX2luZGV4ND1ncmVwKCI8aDQ+SW5zdHJ1Y3Rvcjo8L2g0PiIsIGNvdXJzZV9wYWdlKQppbmRleDI9Y291cnNlX3BhZ2VbKGhlYWRpbmdfaW5kZXg0W2xlbmd0aChoZWFkaW5nX2luZGV4NCldLTEpOihoZWFkaW5nX2luZGV4NFtsZW5ndGgoaGVhZGluZ19pbmRleDQpXSsxKV0KZm9ybWF0Mj1nc3ViKCI8W14+XSs+IiwiIixpbmRleDIpCmluc3RydWN0b3JuYW1lPWdzdWIoIl5cXHMrIiwiIixmb3JtYXQyWzNdKQoKI0NsYXNzIHRpbWVzCmhlYWRpbmdfaW5kZXg1PWdyZXBsKCI8L2g0Pnw8L3A+IiwgY291cnNlX3BhZ2UpCmZvcm1hdDM9Z3N1YigiPFtePl0rPiIsIiAiLGNvdXJzZV9wYWdlW2hlYWRpbmdfaW5kZXg1XVsyXSkKZm9ybWF0Mz1nc3ViKCIoXlxccyt8ICQpIiwiIixmb3JtYXQzKQpjbGFzc3RpbWVzPWdzdWIoIiBbWzpwdW5jdDpdXVtbOmFsbnVtOl1dK1tbOnB1bmN0Ol1dIiwgIiIsZm9ybWF0MykKY2xhc3N0aW1lc3JlYWRhYmxlPWdzdWIoIltbOmRpZ2l0Ol1dK1xccyhQTSlcXHNbWzpkaWdpdDpdXSIsICJbWzpkaWdpdDpdXStcXHMoUE0gdG8pXFxzW1s6ZGlnaXQ6XV0iLGNsYXNzdGltZXMpCgojTmFtZSBvZiB0ZXh0Ym9vawpoZWFkaW5nX2luZGV4Nj1ncmVwKCJSRUFESU5HIiwgY291cnNlX3BhZ2UpCmZvcm1hdDQ9Z3N1YigiPFtePl0rPiIsIiAiLGNvdXJzZV9wYWdlWyhoZWFkaW5nX2luZGV4Nik6KGhlYWRpbmdfaW5kZXg2KzUpXSkKZm9ybWF0ND1nc3ViKCIoXlxccyt8ICQpIiwiIixmb3JtYXQ0WzVdKQpmb3JtYXQ0PWdzdWIoIlsmXStbWzphbG51bTpdXXszLH1bO10iLCIiLGZvcm1hdDQpCnRleHRib29rPWdzdWIoIihcXHN7Mix9KSIsIiAiLGZvcm1hdDQpCgojRXhhbSB0aW1lcwpoZWFkaW5nX2luZGV4Nz1ncmVwKCJFeGFtIFRpbWVzIixjb3Vyc2VfcGFnZSkKaW5kZXg3PWNvdXJzZV9wYWdlWyhoZWFkaW5nX2luZGV4N1tsZW5ndGgoaGVhZGluZ19pbmRleDcpXSk6KGhlYWRpbmdfaW5kZXg3W2xlbmd0aChoZWFkaW5nX2luZGV4NyldKzcpXQpmb3JtYXQ1PWdzdWIoIjxbXj5dKz4iLCIgIixpbmRleDdbLWMoMiw1LDYpXSkKZm9ybWF0NT1nc3ViKCIoXlxccyt8ICQpIiwiIixmb3JtYXQ1KQpmb3JtYXQ1PWdzdWIoIiBbWzpwdW5jdDpdXVtbOmFsbnVtOl1dK1tbOnB1bmN0Ol1dIiwgIiIsZm9ybWF0NSkKZ2V0
dGltZXM9cmVnZXhwcigiLitbUE1dXFxiIiwgZm9ybWF0NSkKdGltZXM9cmVnbWF0Y2hlcyhmb3JtYXQ1LGdldHRpbWVzKQpnZXRwbGFjZT1yZWdleHByKCJbWzp1cHBlcjpdXStcXHNbWzpkaWdpdDpdXXs0LH0oXFxzfFtbOnB1bmN0Ol1dKStbWzphbG51bTpdXSskIiwgZm9ybWF0NSkKcGxhY2U9cmVnbWF0Y2hlcyhmb3JtYXQ1LGdldHBsYWNlKQpleGFtdGltZXM9Yyhmb3JtYXQ1WzE6Ml0sdGltZXNbMV0scGxhY2VbMV0sZm9ybWF0NVs0XSx0aW1lc1syXSxwbGFjZVsyXSkKZXhhbT1wYXN0ZTAoYXMudmVjdG9yKG5hLm9taXQoZXhhbXRpbWVzWzM6N10pKSxjb2xsYXBzZT0iICIpCgoKY291cnNlc2RmPWRhdGEuZnJhbWUoQ2xhc3MuTnVtYmVyPWNsYXNzbnVtLAogICAgICAgICAgICAgICAgICAgICBEZWxpdmVyeS5NZXRob2Q9ZGVsaXZtZXRob2QsCiAgICAgICAgICAgICAgICAgICAgIENvdXJzZS5OYW1lLkFuZC5OdW1iZXI9Y291cnNlbnVtLAogICAgICAgICAgICAgICAgICAgICBUaXRsZT1jb3Vyc2V0aXRsZSwgCiAgICAgICAgICAgICAgICAgICAgIEluc3RydWN0b3I9aW5zdHJ1Y3Rvcm5hbWUsIAogICAgICAgICAgICAgICAgICAgICBDb3Vyc2UuVGltZXMuYW5kLkxvY2F0aW9ucz1jbGFzc3RpbWVzLCAKICAgICAgICAgICAgICAgICAgICAgVGV4dGJvb2s9dGV4dGJvb2ssIAogICAgICAgICAgICAgICAgICAgICBFeGFtLlRpbWUuYW5kLkxvY2F0aW9uPWV4YW0pCgpjb3Vyc2VzZGYxPXJiaW5kKGNvdXJzZXNkZjEsY291cnNlc2RmKQp9CnJldHVybihjb3Vyc2VzZGYxKQp9Cgpjb3Vyc2VzKGNvdXJzZV91cmwpCmBgYAoK