Each tab represents different ways of loading data into R and the variety of file types (txt, XML, JSON, HTML, API, Neo4j, MySQL) R is compatible with.
Use readLines function to load txt file
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
As seen above, every third row starting with row 5 contains the chess player names and every third row starting on row 6 contains states and ratings.
Player Name
#every third row starting on row 5
player_names <- seq(5, 196, 3)
names_and_rounds <- Chess_RawData[player_names]
head(names_and_rounds)## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
State & Ratings
#every third row starting on row 6
st_rate <- seq(6, 196, 3)
state_and_ratings <- Chess_RawData[st_rate]
head(state_and_ratings)## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
Extraction
Names
extracted_names <- unlist(str_extract_all(names_and_rounds,
"\\w+\\s\\w+\\s\\w+\\s\\w+\\s|\\w+\\s\\w+\\s\\w+|\\w+\\s\\w+"))
head(extracted_names)## [1] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ"
## [4] "PATRICK H SCHILLING" "HANSHI ZUO" "HANSEN SONG"
States
extracted_states <- unlist(str_extract_all(state_and_ratings, "\\s\\w{2}\\s"))
head(extracted_states)## [1] " ON " " MI " " MI " " MI " " MI " " OH "
Pre Ratings
preratings <- unlist(str_extract_all(state_and_ratings, "[:]\\s+[[:digit:]]+"))
preratings <- unlist(str_extract_all(preratings, "[[:digit:]]+"))
head(preratings)## [1] "1794" "1553" "1384" "1716" "1655" "1686"
Post Ratings
postratings <- unlist(str_extract_all(state_and_ratings, "[>]\\s*[[:digit:]]+"))
postratings <- unlist(str_extract_all(postratings, "[[:digit:]]+"))
head(postratings)## [1] "1817" "1663" "1640" "1744" "1690" "1687"
Rounds
allrounds <- unlist(str_extract_all(names_and_rounds, "\\s?[[:print:]]{1,2}[|][[:alpha:]]?"))
round1 <- allrounds[seq(4, length(allrounds), 10)]
round2 <- allrounds[seq(5, length(allrounds), 10)]
round3 <- allrounds[seq(6, length(allrounds), 10)]
round4 <- allrounds[seq(7, length(allrounds), 10)]
round5 <- allrounds[seq(8, length(allrounds), 10)]
round6 <- allrounds[seq(9, length(allrounds), 10)]
round7 <- allrounds[seq(10, length(allrounds), 10)]
round1 <- unlist(str_extract_all(round1, "[[:print:]]+[^[|]][^[:alpha:]_]"))
round1 <- unlist(str_extract_all(round1, "[[:print:]]+[^[|]]"))
round2 <- unlist(str_extract_all(round2, "[[:print:]]+[^[|]][^[:alpha:]_]"))
round2 <- unlist(str_extract_all(round2, "[[:print:]]+[^[|]]"))
round3 <- unlist(str_extract_all(round3, "[[:print:]]+[^[|]][^[:alpha:]_]"))
round3 <- unlist(str_extract_all(round3, "[[:print:]]+[^[|]]"))
round4 <- unlist(str_extract_all(round4, "[[:print:]]+[^[|]][^[:alpha:]_]"))
round4 <- unlist(str_extract_all(round4, "[[:print:]]+[^[|]]"))
round5 <- unlist(str_extract_all(round5, "[[:print:]]+[^[|]][^[:alpha:]_]"))
round5 <- unlist(str_extract_all(round5, "[[:print:]]+[^[|]]"))
round6 <- unlist(str_extract_all(round6, "[[:print:]]+[^[|]][^[:alpha:]_]"))
round6 <- unlist(str_extract_all(round6, "[[:print:]]+[^[|]]"))
round7 <- unlist(str_extract_all(round7, "[[:print:]]+[^[|]]"))
head (round1)## [1] " 39" " 63" " 8" " 23" " 45" " 34"
id <- seq(1, 64, 1)
Chess_No_Avg<- data.frame(id, extracted_names, extracted_states,TableChess$Total,
"Pre Rating" = as.numeric(preratings),
"Post Rating" = as.numeric(postratings),
"Round 1" = as.numeric(round1), "Round 2" = as.numeric(round2),
"Round 3" = as.numeric(round3),
"Round 4" = as.numeric(round4), "Round 5" = as.numeric(round5),
"Round 6" = as.numeric(round6), "Round 7" = as.numeric(round7))
#Column Names
colnames(Chess_No_Avg) <- c ("Id", "Player Name", "State","Total Points", "Pre Rating",
"Post Rating", "Round 1","Round 2","Round 3","Round 4",
"Round 5","Round 6","Round 7")
glimpse(Chess_No_Avg)## Observations: 64
## Variables: 13
## $ Id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ `Player Name` <fct> GARY HUA, DAKSHESH DARURI, ADITYA BAJAJ, PATRICK H SCH…
## $ State <fct> ON , MI , MI , MI , MI , OH , MI , MI , ON , …
## $ `Total Points` <dbl> 6.0, 6.0, 6.0, 5.5, 5.5, 5.0, 5.0, 5.0, 5.0, 5.0, 4.5,…
## $ `Pre Rating` <dbl> 1794, 1553, 1384, 1716, 1655, 1686, 1649, 1641, 1411, …
## $ `Post Rating` <dbl> 1817, 1663, 1640, 1744, 1690, 1687, 1673, 1657, 1564, …
## $ `Round 1` <dbl> 39, 63, 8, 23, 45, 34, 57, 3, 25, 16, 38, 42, 36, 54, …
## $ `Round 2` <dbl> 21, 58, 61, 28, 37, 29, 46, 32, 18, 19, 56, 33, 27, 44…
## $ `Round 3` <dbl> 18, 4, 25, 2, 12, 11, 13, 14, 59, 55, 6, 5, 7, 8, 30, …
## $ `Round 4` <dbl> 14, 17, 21, 26, 13, 35, 11, 9, 8, 31, 7, 38, 5, 1, 22,…
## $ `Round 5` <dbl> 7, 16, 11, 5, 4, 10, 1, 47, 26, 6, 3, NA, 33, 27, 54, …
## $ `Round 6` <dbl> 12, 20, 13, 19, 14, 27, 9, 28, 7, 25, 34, 1, 3, 5, 33,…
## $ `Round 7` <dbl> 4, 7, 12, 1, 17, 21, 2, 19, 20, 18, 26, 3, 32, 31, 38,…
Loop Used to Calculate Average Pre Chess Rating of Opponent
Avg_PreRating <- c()
for (i in Chess_No_Avg$Id){
a <- Chess_No_Avg[Chess_No_Avg$'Round 1'==i, "Pre Rating"]
a <- a[!is.na(a)]
b <- Chess_No_Avg[Chess_No_Avg$'Round 2'==i, "Pre Rating"]
b <- b[!is.na(b)]
c <- Chess_No_Avg[Chess_No_Avg$'Round 3'==i, "Pre Rating"]
c <- c[!is.na(c)]
d <- Chess_No_Avg[Chess_No_Avg$'Round 4'==i, "Pre Rating"]
d <- d[!is.na(d)]
e <- Chess_No_Avg[Chess_No_Avg$'Round 5'==i, "Pre Rating"]
e <- e[!is.na(e)]
f <- Chess_No_Avg[Chess_No_Avg$'Round 6'==i, "Pre Rating"]
f <- f[!is.na(f)]
g <- Chess_No_Avg[Chess_No_Avg$'Round 7'==i, "Pre Rating"]
g <- g[!is.na(g)]
avg_rate <- mean(c(a, b, c, d, e, f, g))
Avg_PreRating <- c(Avg_PreRating, avg_rate)}#add calculations from above loop to initial Data Frame
Chess_No_Avg ["Opponent Pre Rating"] <- Avg_PreRating
FinalDataCSV <- subset(Chess_No_Avg, select = c ("Id", "Player Name", "State",
"Pre Rating", "Opponent Pre Rating"))
FinalDataCSV %>%
kable() %>%
kable_styling()| Id | Player Name | State | Pre Rating | Opponent Pre Rating |
|---|---|---|---|---|
| 1 | GARY HUA | ON | 1794 | 1605.286 |
| 2 | DAKSHESH DARURI | MI | 1553 | 1469.286 |
| 3 | ADITYA BAJAJ | MI | 1384 | 1563.571 |
| 4 | PATRICK H SCHILLING | MI | 1716 | 1573.571 |
| 5 | HANSHI ZUO | MI | 1655 | 1500.857 |
| 6 | HANSEN SONG | OH | 1686 | 1518.714 |
| 7 | GARY DEE SWATHELL | MI | 1649 | 1372.143 |
| 8 | EZEKIEL HOUGHTON | MI | 1641 | 1468.429 |
| 9 | STEFANO LEE | ON | 1411 | 1523.143 |
| 10 | ANVIT RAO | MI | 1365 | 1554.143 |
| 11 | CAMERON WILLIAM MC LEMAN | MI | 1712 | 1467.571 |
| 12 | KENNETH J TACK | MI | 1663 | 1506.167 |
| 13 | TORRANCE HENRY JR | MI | 1666 | 1497.857 |
| 14 | BRADLEY SHAW | MI | 1610 | 1515.000 |
| 15 | ZACHARY JAMES HOUGHTON | MI | 1220 | 1483.857 |
| 16 | MIKE NIKITIN | MI | 1604 | 1385.800 |
| 17 | RONALD GRZEGORCZYK | MI | 1629 | 1498.571 |
| 18 | DAVID SUNDEEN | MI | 1600 | 1480.000 |
| 19 | DIPANKAR ROY | MI | 1564 | 1426.286 |
| 20 | JASON ZHENG | MI | 1595 | 1410.857 |
| 21 | DINH DANG BUI | ON | 1563 | 1470.429 |
| 22 | EUGENE L MCCLURE | MI | 1555 | 1300.333 |
| 23 | ALAN BUI | ON | 1363 | 1213.857 |
| 24 | MICHAEL R ALDRICH | MI | 1229 | 1357.000 |
| 25 | LOREN SCHWIEBERT | MI | 1745 | 1363.286 |
| 26 | MAX ZHU | ON | 1579 | 1506.857 |
| 27 | GAURAV GIDWANI | MI | 1552 | 1221.667 |
| 28 | SOFIA ADINA STANESCU | MI | 1507 | 1522.143 |
| 29 | CHIEDOZIE OKORIE | MI | 1602 | 1313.500 |
| 30 | GEORGE AVERY JONES | ON | 1522 | 1144.143 |
| 31 | RISHI SHETTY | MI | 1494 | 1259.857 |
| 32 | JOSHUA PHILIP MATHEWS | ON | 1441 | 1378.714 |
| 33 | JADE GE | MI | 1449 | 1276.857 |
| 34 | MICHAEL JEFFERY THOMAS | MI | 1399 | 1375.286 |
| 35 | JOSHUA DAVID LEE | MI | 1438 | 1149.714 |
| 36 | SIDDHARTH JHA | MI | 1355 | 1388.167 |
| 37 | AMIYATOSH PWNANANDAM | MI | 980 | 1384.800 |
| 38 | BRIAN LIU | MI | 1423 | 1539.167 |
| 39 | JOEL R HENDON | MI | 1436 | 1429.571 |
| 40 | FOREST ZHANG | MI | 1348 | 1390.571 |
| 41 | KYLE WILLIAM MURPHY | MI | 1403 | 1248.500 |
| 42 | JARED GE | MI | 1332 | 1149.857 |
| 43 | ROBERT GLEN VASEY | MI | 1283 | 1106.571 |
| 44 | JUSTIN D SCHILLING | MI | 1199 | 1327.000 |
| 45 | DEREK YAN | MI | 1242 | 1152.000 |
| 46 | JACOB ALEXANDER LAVALLEY | MI | 377 | 1357.714 |
| 47 | ERIC WRIGHT | MI | 1362 | 1392.000 |
| 48 | DANIEL KHAIN | MI | 1382 | 1355.800 |
| 49 | MICHAEL J MARTIN | MI | 1291 | 1285.800 |
| 50 | SHIVAM JHA | MI | 1056 | 1296.000 |
| 51 | TEJAS AYYAGARI | MI | 1011 | 1356.143 |
| 52 | ETHAN GUO | MI | 935 | 1494.571 |
| 53 | JOSE C YBARRA | MI | 1393 | 1345.333 |
| 54 | LARRY HODGE | MI | 1270 | 1206.167 |
| 55 | ALEX KONG | MI | 1186 | 1406.000 |
| 56 | MARISA RICCI | MI | 1153 | 1414.400 |
| 57 | MICHAEL LU | MI | 1092 | 1363.000 |
| 58 | VIRAJ MOHILE | MI | 917 | 1391.000 |
| 59 | SEAN M MC CORMICK | MI | 853 | 1319.000 |
| 60 | JULIA SHEN | MI | 967 | 1330.200 |
| 61 | JEZZEL FARKAS | ON | 955 | 1327.286 |
| 62 | ASHWIN BALAJI | MI | 1530 | 1186.000 |
| 63 | THOMAS JOSEPH HOSMER | MI | 1175 | 1350.200 |
| 64 | BEN LI | MI | 1163 | 1263.000 |
In case a user is more comfortable with Microsoft Excel, we can save the final dataframe into a csv file with the following code:
library("XML")
library("RCurl")
books_xml_url <- "https://raw.githubusercontent.com/aaronzal95/data607/master/books.xml"
#RCurl getURL
books_xml <- getURL(books_xml_url)Using XML xmlParse function, xmlRoot function and xmlToDataFrame function to transfer books data from xml to R dataframe
books_parse <- xmlParse(file = books_xml)
books_xml_root <- xmlRoot(books_parse)
books_data <- xmlToDataFrame (books_xml_root)
books_data## author title genre secondaryauthor
## 1 Steinbeck, John Of Mice and Men Fiction Aaron Judge
## 2 Kesey, Ken One Flew Over the Cuckoo's Nest Fiction
## 3 Hosseini, Khaled And the Mountains Echoed Fiction Didi Gregorius
## publish_date isbn
## 1 1937 9780142000670
## 2 1962 9780451163967
## 3 2013 9781594631764
#testing to see if this is a valid json file via isValidJSON
isValidJSON("https://raw.githubusercontent.com/aaronzal95/data607/master/books.json")## [1] TRUE
Using RJSONIO fromJSON function and dplyr and tidyr to transfer and clean books data from json to R dataframe
books_fromjson <- fromJSON(content="https://raw.githubusercontent.com/aaronzal95/data607/master/books.json"
, simplify = FALSE)
#data clean
unlist_json <- sapply(books_fromjson[[1]], unlist)
json_df <- as.data.frame(as.table(unlist_json))
dplyr_clean <- json_df %>%
select(-Var2) %>%
group_by(Var1) %>%
mutate(BookNumber= row_number()) %>%
spread(Var1, Freq)
dplyr_clean ## # A tibble: 3 x 7
## BookNumber title authors.Main authors.Seconda… year isbn genre
## <int> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 1 Of Mice and Men John Steinbeck Aaron Judge 1937 978014… Fict…
## 2 2 One Flew Over … Ken Kesey "" 1962 978045… Fict…
## 3 3 And the Mounta… Khaled Hossei… Didi Gregorius 2013 978159… Fict…
books_html_url <- "https://raw.githubusercontent.com/aaronzal95/data607/master/books.html"
books_html <- getURL(books_html_url)Using XML readHTMLTable function to transfer books data from html to R. I will then run a call (do.call) to cast the list as a data frame.
html_df <- readHTMLTable(books_html, stringsAsFactors=FALSE)
do.call(rbind, lapply(html_df, data.frame, stringsAsFactors=FALSE))## X.id. X.Title. X.Main.Author.
## NULL.1 1 Of Mice and Men John Steinbeck
## NULL.2 2 One Flew Over the Cuckoo's Nest Ken Kesey
## NULL.3 3 And the Mountains Echoed Khaled Hosseini
## X.Secondary.Author. X.Published.Year. X.ISBN. X.Genre.
## NULL.1 Aaron Judge 1937 9780142000670 Fiction
## NULL.2 NA 1962 9780451163967 Fiction
## NULL.3 Didi Gregorius 2013 9781594631764 Fiction
The three dataframes are not identical. I believe that this is because of the different data structures. XML and HTML both use tags <></> in the syntax. Even though they both utilize tags, the original HTML file was slightly different because the column headers were listed prior to the data itself. The HTML data frame has a . for every space in the column headers and the headers start with a ‘X’. The JSON data structure on the other hand utilizes {} and :
Summary
I had to utilize different functions from various packages to address the different structures. I used the following functions for each data source type:
XML file: xmlParse,xmlRoot, xmlToDataFrame
JSON file: isValidJSON, fromJSON
HTML file: readHTMLTable, do.call for list
Use GET and content from httr package and input the New York Times provided api-key in order to retrieve the data. The idea of this key is similar to loading data into Redshift from S3 (requires ACCESS_KEY_ID and SECRET_ACCESS_KEY AWS)
Then use jsonlite’s fromJSON to extract the data and place into a data frame.
url_path <- "https://api.nytimes.com/svc/topstories/v2/food.json"
#ny times provided api key
httr_get_url <- GET(url_path, query = list("api-key" = "EDpv61o8hF7iFaiUM0yAdu8iZeGzrIW8"))
httr_content <- content(httr_get_url, "text")
food_data_json <- fromJSON(httr_content)
food_df <- as.data.frame(food_data_json)Use dplyr to select the four interested columns
food_df2 <- (food_df %>%
select(one_of(c("results.section", "results.title", "results.url","results.byline"))))
names (food_df2) <- c("Section", "Article Title", "Article Link", "Author")
food_df2%>%
kable() %>%
kable_styling()Utilize these two packages in order to load the inventory data from MySQL
library(DBI)
library(RMySQL)
#for table visual
library(kableExtra)
#for plot visual
library (ggplot2)mysql <- dbDriver("MySQL")
con <- dbConnect(mysql, dbname = 'inventory', user = 'root', password = 'cunydata', host = 'localhost')Selecting Raw Data From Three Tables
#using SQL code in R
products <- dbGetQuery(con, "SELECT * FROM products")
categories <- dbGetQuery(con, "SELECT * FROM categories")
suppliers <- dbGetQuery(con, "SELECT * FROM suppliers") Joined Data Frame
units_total <-as.data.frame(merged_data$unitPrice * merged_data$unitsInStock)
colnames(units_total) <- 'Total Available ($)'
finaldf <- cbind (merged_data [1:3],units_total$`Total Available ($)`)
#column headers of final data frame
colnames(finaldf)<-c("Product","Category","Supplier Company","Total Available ($)")
#rename some of the values from the Category column
finaldf$Category[(finaldf$Category)=="Grains/Cereals"]<- "Grains"
finaldf$Category[(finaldf$Category)== "Dairy Products"]<- "Dairy"
finaldf$Category[(finaldf$Category)=="Meat/Poultry"]<- "Meat"
glimpse(finaldf)## Observations: 77
## Variables: 4
## $ Product <chr> "Chai", "Chang", "Aniseed Syrup", "Chef Anton's…
## $ Category <chr> "Beverages", "Beverages", "Condiments", "Condim…
## $ `Supplier Company` <chr> "Exotic Liquids", "Exotic Liquids", "Exotic Liq…
## $ `Total Available ($)` <dbl> 702.00, 323.00, 130.00, 1166.00, 0.00, 1599.80,…
#range of product total available in dollars by category
geo_plot <- ggplot(finaldf, aes(x = Category, y = `Total Available ($)` ))
geo_plot + geom_line(aes(color = Category)) + ggtitle("Range of Available Stocked Products by Product Category")#ggplot2 scatterplot
ggScatPlot <- ggplot(finaldf, aes(x = Category, y = `Total Available ($)`)) + geom_point()
ggScatPlot <- ggScatPlot + geom_point(aes(color = Category))
ggScatPlot <- ggScatPlot + ggtitle("Total Worth in Available Stocked Products by Product Category")
ggScatPlotExport CSV Files for Github Repo (similar code used with Chess Tournament Data)
library (neo4r)
con <- neo4j_api$new(url = "http://localhost:7687", user = "neo4j", password = "cunydata")Load CSV Files from Github Repo to Neo4j Database (NO-SQL)
Relational Database (MySQL)
Advantages
Disadvantages
Graphic NoSQL Database (Neo4j)
Advantages
Disadvantages