Data Acquisition

Each tab represents different ways of loading data into R and the variety of file types (txt, XML, JSON, HTML, API, Neo4j, MySQL) R is compatible with.

Chess Tournament txt Data

library(stringr)
library(dplyr)
library(kableExtra)

Raw Chess Tournament Data

Use readLines function to load txt file

Chess_RawData <- readLines("/Users/aaronzalki/Desktop/tournamentinfo.txt")
head(Chess_RawData, 10)

##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------"

As seen above, every third row starting with row 5 contains the chess player names and every third row starting on row 6 contains states and ratings.

Transfer Data to Data Frame

Player Name

#every third row starting on row 5
player_names <- seq(5, 196, 3)
names_and_rounds <- Chess_RawData[player_names]
head(names_and_rounds)

## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [3] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [4] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
## [5] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"
## [6] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"

State & Ratings

#every third row starting on row 6
st_rate <- seq(6, 196, 3)
state_and_ratings <- Chess_RawData[st_rate]
head(state_and_ratings)

## [1] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [2] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [3] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [4] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
## [5] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "   OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |"

Extraction

Names

extracted_names <- unlist(str_extract_all(names_and_rounds,
                                          
"\\w+\\s\\w+\\s\\w+\\s\\w+\\s|\\w+\\s\\w+\\s\\w+|\\w+\\s\\w+"))
head(extracted_names)

## [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
## [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"

States

extracted_states <- unlist(str_extract_all(state_and_ratings, "\\s\\w{2}\\s"))
head(extracted_states)

## [1] " ON " " MI " " MI " " MI " " MI " " OH "

Pre Ratings

preratings <- unlist(str_extract_all(state_and_ratings, "[:]\\s+[[:digit:]]+"))
preratings <- unlist(str_extract_all(preratings, "[[:digit:]]+"))
head(preratings)

## [1] "1794" "1553" "1384" "1716" "1655" "1686"

Post Ratings

postratings <- unlist(str_extract_all(state_and_ratings, "[>]\\s*[[:digit:]]+"))
postratings <- unlist(str_extract_all(postratings, "[[:digit:]]+"))
head(postratings)

## [1] "1817" "1663" "1640" "1744" "1690" "1687"

Rounds

allrounds <- unlist(str_extract_all(names_and_rounds, "\\s?[[:print:]]{1,2}[|][[:alpha:]]?"))
round1 <- allrounds[seq(4, length(allrounds), 10)]
round2 <- allrounds[seq(5, length(allrounds), 10)]
round3 <- allrounds[seq(6, length(allrounds), 10)]
round4 <- allrounds[seq(7, length(allrounds), 10)]
round5 <- allrounds[seq(8, length(allrounds), 10)]
round6 <- allrounds[seq(9, length(allrounds), 10)]
round7 <- allrounds[seq(10, length(allrounds), 10)]

round1 <- unlist(str_extract_all(round1, "[[:print:]]+[^[|]][^[:alpha:]_]"))
round1 <- unlist(str_extract_all(round1, "[[:print:]]+[^[|]]"))
round2 <- unlist(str_extract_all(round2, "[[:print:]]+[^[|]][^[:alpha:]_]"))
round2 <- unlist(str_extract_all(round2, "[[:print:]]+[^[|]]"))
round3 <- unlist(str_extract_all(round3, "[[:print:]]+[^[|]][^[:alpha:]_]"))
round3 <- unlist(str_extract_all(round3, "[[:print:]]+[^[|]]"))
round4 <- unlist(str_extract_all(round4, "[[:print:]]+[^[|]][^[:alpha:]_]"))
round4 <- unlist(str_extract_all(round4, "[[:print:]]+[^[|]]"))
round5 <- unlist(str_extract_all(round5, "[[:print:]]+[^[|]][^[:alpha:]_]"))
round5 <- unlist(str_extract_all(round5, "[[:print:]]+[^[|]]"))
round6 <- unlist(str_extract_all(round6, "[[:print:]]+[^[|]][^[:alpha:]_]"))
round6 <- unlist(str_extract_all(round6, "[[:print:]]+[^[|]]"))
round7 <- unlist(str_extract_all(round7, "[[:print:]]+[^[|]]"))

head (round1)

## [1] " 39" " 63" "  8" " 23" " 45" " 34"

Data Frame

id <- seq(1, 64, 1)
Chess_No_Avg<- data.frame(id, extracted_names, extracted_states,TableChess$Total,
                          "Pre Rating" = as.numeric(preratings),
                          "Post Rating" = as.numeric(postratings), 
                          "Round 1" = as.numeric(round1), "Round 2" = as.numeric(round2), 
                          "Round 3" = as.numeric(round3), 
                          "Round 4" = as.numeric(round4), "Round 5" = as.numeric(round5),
                          "Round 6" = as.numeric(round6), "Round 7" = as.numeric(round7))
#Column Names
colnames(Chess_No_Avg) <- c ("Id", "Player Name", "State","Total Points", "Pre Rating",
                             "Post Rating", "Round 1","Round 2","Round 3","Round 4",
                             "Round 5","Round 6","Round 7")

glimpse(Chess_No_Avg)

## Observations: 64
## Variables: 13
## $ Id             <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ `Player Name`  <fct> GARY HUA, DAKSHESH DARURI, ADITYA BAJAJ, PATRICK H SCH…
## $ State          <fct>  ON ,  MI ,  MI ,  MI ,  MI ,  OH ,  MI ,  MI ,  ON , …
## $ `Total Points` <dbl> 6.0, 6.0, 6.0, 5.5, 5.5, 5.0, 5.0, 5.0, 5.0, 5.0, 4.5,…
## $ `Pre Rating`   <dbl> 1794, 1553, 1384, 1716, 1655, 1686, 1649, 1641, 1411, …
## $ `Post Rating`  <dbl> 1817, 1663, 1640, 1744, 1690, 1687, 1673, 1657, 1564, …
## $ `Round 1`      <dbl> 39, 63, 8, 23, 45, 34, 57, 3, 25, 16, 38, 42, 36, 54, …
## $ `Round 2`      <dbl> 21, 58, 61, 28, 37, 29, 46, 32, 18, 19, 56, 33, 27, 44…
## $ `Round 3`      <dbl> 18, 4, 25, 2, 12, 11, 13, 14, 59, 55, 6, 5, 7, 8, 30, …
## $ `Round 4`      <dbl> 14, 17, 21, 26, 13, 35, 11, 9, 8, 31, 7, 38, 5, 1, 22,…
## $ `Round 5`      <dbl> 7, 16, 11, 5, 4, 10, 1, 47, 26, 6, 3, NA, 33, 27, 54, …
## $ `Round 6`      <dbl> 12, 20, 13, 19, 14, 27, 9, 28, 7, 25, 34, 1, 3, 5, 33,…
## $ `Round 7`      <dbl> 4, 7, 12, 1, 17, 21, 2, 19, 20, 18, 26, 3, 32, 31, 38,…

Loop Used to Calculate Average Pre Chess Rating of Opponent

Avg_PreRating <- c()

for (i in Chess_No_Avg$Id){
  a <- Chess_No_Avg[Chess_No_Avg$'Round 1'==i, "Pre Rating"]
  a <- a[!is.na(a)]
  b <- Chess_No_Avg[Chess_No_Avg$'Round 2'==i, "Pre Rating"]
  b <- b[!is.na(b)]
  c <- Chess_No_Avg[Chess_No_Avg$'Round 3'==i, "Pre Rating"]
  c <- c[!is.na(c)]
  d <- Chess_No_Avg[Chess_No_Avg$'Round 4'==i, "Pre Rating"]
  d <- d[!is.na(d)]
  e <- Chess_No_Avg[Chess_No_Avg$'Round 5'==i, "Pre Rating"]
  e <- e[!is.na(e)]
  f <- Chess_No_Avg[Chess_No_Avg$'Round 6'==i, "Pre Rating"]
  f <- f[!is.na(f)]
  g <- Chess_No_Avg[Chess_No_Avg$'Round 7'==i, "Pre Rating"]
  g <- g[!is.na(g)]
  avg_rate <- mean(c(a, b, c, d, e, f, g))
  Avg_PreRating  <- c(Avg_PreRating, avg_rate)}

Final Data Frame

#add calculations from above loop to initial Data Frame
Chess_No_Avg ["Opponent Pre Rating"] <- Avg_PreRating
FinalDataCSV  <- subset(Chess_No_Avg, select = c ("Id", "Player Name", "State",
                            "Pre Rating", "Opponent Pre Rating"))
FinalDataCSV %>%
  kable() %>%
  kable_styling()

Id	Player Name	State	Pre Rating	Opponent Pre Rating
1	GARY HUA	ON	1794	1605.286
2	DAKSHESH DARURI	MI	1553	1469.286
3	ADITYA BAJAJ	MI	1384	1563.571
4	PATRICK H SCHILLING	MI	1716	1573.571
5	HANSHI ZUO	MI	1655	1500.857
6	HANSEN SONG	OH	1686	1518.714
7	GARY DEE SWATHELL	MI	1649	1372.143
8	EZEKIEL HOUGHTON	MI	1641	1468.429
9	STEFANO LEE	ON	1411	1523.143
10	ANVIT RAO	MI	1365	1554.143
11	CAMERON WILLIAM MC LEMAN	MI	1712	1467.571
12	KENNETH J TACK	MI	1663	1506.167
13	TORRANCE HENRY JR	MI	1666	1497.857
14	BRADLEY SHAW	MI	1610	1515.000
15	ZACHARY JAMES HOUGHTON	MI	1220	1483.857
16	MIKE NIKITIN	MI	1604	1385.800
17	RONALD GRZEGORCZYK	MI	1629	1498.571
18	DAVID SUNDEEN	MI	1600	1480.000
19	DIPANKAR ROY	MI	1564	1426.286
20	JASON ZHENG	MI	1595	1410.857
21	DINH DANG BUI	ON	1563	1470.429
22	EUGENE L MCCLURE	MI	1555	1300.333
23	ALAN BUI	ON	1363	1213.857
24	MICHAEL R ALDRICH	MI	1229	1357.000
25	LOREN SCHWIEBERT	MI	1745	1363.286
26	MAX ZHU	ON	1579	1506.857
27	GAURAV GIDWANI	MI	1552	1221.667
28	SOFIA ADINA STANESCU	MI	1507	1522.143
29	CHIEDOZIE OKORIE	MI	1602	1313.500
30	GEORGE AVERY JONES	ON	1522	1144.143
31	RISHI SHETTY	MI	1494	1259.857
32	JOSHUA PHILIP MATHEWS	ON	1441	1378.714
33	JADE GE	MI	1449	1276.857
34	MICHAEL JEFFERY THOMAS	MI	1399	1375.286
35	JOSHUA DAVID LEE	MI	1438	1149.714
36	SIDDHARTH JHA	MI	1355	1388.167
37	AMIYATOSH PWNANANDAM	MI	980	1384.800
38	BRIAN LIU	MI	1423	1539.167
39	JOEL R HENDON	MI	1436	1429.571
40	FOREST ZHANG	MI	1348	1390.571
41	KYLE WILLIAM MURPHY	MI	1403	1248.500
42	JARED GE	MI	1332	1149.857
43	ROBERT GLEN VASEY	MI	1283	1106.571
44	JUSTIN D SCHILLING	MI	1199	1327.000
45	DEREK YAN	MI	1242	1152.000
46	JACOB ALEXANDER LAVALLEY	MI	377	1357.714
47	ERIC WRIGHT	MI	1362	1392.000
48	DANIEL KHAIN	MI	1382	1355.800
49	MICHAEL J MARTIN	MI	1291	1285.800
50	SHIVAM JHA	MI	1056	1296.000
51	TEJAS AYYAGARI	MI	1011	1356.143
52	ETHAN GUO	MI	935	1494.571
53	JOSE C YBARRA	MI	1393	1345.333
54	LARRY HODGE	MI	1270	1206.167
55	ALEX KONG	MI	1186	1406.000
56	MARISA RICCI	MI	1153	1414.400
57	MICHAEL LU	MI	1092	1363.000
58	VIRAJ MOHILE	MI	917	1391.000
59	SEAN M MC CORMICK	MI	853	1319.000
60	JULIA SHEN	MI	967	1330.200
61	JEZZEL FARKAS	ON	955	1327.286
62	ASHWIN BALAJI	MI	1530	1186.000
63	THOMAS JOSEPH HOSMER	MI	1175	1350.200
64	BEN LI	MI	1163	1263.000

In case a user is more comfortable with Microsoft Excel, we can save the final dataframe into a csv file with the following code:

#export dataframe to csv
write.csv(FinalDataCSV, "Chess Tournament.csv")

XML, JSON, HTML

XML

library("XML")
library("RCurl")

books_xml_url <- "https://raw.githubusercontent.com/aaronzal95/data607/master/books.xml"
#RCurl getURL
books_xml <- getURL(books_xml_url)

Using XML xmlParse function, xmlRoot function and xmlToDataFrame function to transfer books data from xml to R dataframe

books_parse <- xmlParse(file = books_xml)
books_xml_root <- xmlRoot(books_parse)

books_data <- xmlToDataFrame (books_xml_root)
books_data

##             author                           title   genre secondaryauthor
## 1  Steinbeck, John                 Of Mice and Men Fiction     Aaron Judge
## 2       Kesey, Ken One Flew Over the Cuckoo's Nest Fiction                
## 3 Hosseini, Khaled        And the Mountains Echoed Fiction  Didi Gregorius
##   publish_date          isbn
## 1         1937 9780142000670
## 2         1962 9780451163967
## 3         2013 9781594631764

JSON

library(RJSONIO)
library(dplyr)
library(tidyr)

#testing to see if this is a valid json file via isValidJSON
isValidJSON("https://raw.githubusercontent.com/aaronzal95/data607/master/books.json")

## [1] TRUE

Using RJSONIO fromJSON function and dplyr and tidyr to transfer and clean books data from json to R dataframe

books_fromjson <- fromJSON(content="https://raw.githubusercontent.com/aaronzal95/data607/master/books.json"
                           , simplify = FALSE)

#data clean
unlist_json <- sapply(books_fromjson[[1]], unlist)
json_df <- as.data.frame(as.table(unlist_json))
dplyr_clean <- json_df %>%
  select(-Var2) %>%
  group_by(Var1) %>%
  mutate(BookNumber= row_number()) %>%
  spread(Var1, Freq)
dplyr_clean

## # A tibble: 3 x 7
##   BookNumber title           authors.Main   authors.Seconda… year  isbn    genre
##        <int> <fct>           <fct>          <fct>            <fct> <fct>   <fct>
## 1          1 Of Mice and Men John Steinbeck Aaron Judge      1937  978014… Fict…
## 2          2 One Flew Over … Ken Kesey      ""               1962  978045… Fict…
## 3          3 And the Mounta… Khaled Hossei… Didi Gregorius   2013  978159… Fict…

HTML

books_html_url <- "https://raw.githubusercontent.com/aaronzal95/data607/master/books.html"
books_html <- getURL(books_html_url)

Using XML readHTMLTable function to transfer books data from html to R. I will then run a call (do.call) to cast the list as a data frame.

html_df <- readHTMLTable(books_html, stringsAsFactors=FALSE)
do.call(rbind, lapply(html_df, data.frame, stringsAsFactors=FALSE))

##        X.id.                        X.Title.  X.Main.Author.
## NULL.1     1                 Of Mice and Men  John Steinbeck
## NULL.2     2 One Flew Over the Cuckoo's Nest       Ken Kesey
## NULL.3     3        And the Mountains Echoed Khaled Hosseini
##        X.Secondary.Author. X.Published.Year.       X.ISBN. X.Genre.
## NULL.1         Aaron Judge              1937 9780142000670  Fiction
## NULL.2                  NA              1962 9780451163967  Fiction
## NULL.3      Didi Gregorius              2013 9781594631764  Fiction

Conclusion

The three dataframes are not identical. I believe that this is because of the different data structures. XML and HTML both use tags <></> in the syntax. Even though they both utilize tags, the original HTML file was slightly different because the column headers were listed prior to the data itself. The HTML data frame has a . for every space in the column headers and the headers start with a ‘X’. The JSON data structure on the other hand utilizes {} and :

Summary

I had to utilize different functions from various packages to address the different structures. I used the following functions for each data source type:

XML file: xmlParse,xmlRoot, xmlToDataFrame

JSON file: isValidJSON, fromJSON

HTML file: readHTMLTable, do.call for list

NY Times API

library(httr)
library(jsonlite)
library(dplyr)
library(stringr)
library(kableExtra)

Use GET and content from httr package and input the New York Times provided api-key in order to retrieve the data. The idea of this key is similar to loading data into Redshift from S3 (requires ACCESS_KEY_ID and SECRET_ACCESS_KEY AWS)

Then use jsonlite’s fromJSON to extract the data and place into a data frame.

url_path <- "https://api.nytimes.com/svc/topstories/v2/food.json"
#ny times provided api key
httr_get_url <- GET(url_path, query = list("api-key" = "EDpv61o8hF7iFaiUM0yAdu8iZeGzrIW8"))
httr_content <- content(httr_get_url, "text")


food_data_json <- fromJSON(httr_content)
food_df <- as.data.frame(food_data_json)

Use dplyr to select the four interested columns

food_df2 <- (food_df %>%
  select(one_of(c("results.section", "results.title",  "results.url","results.byline"))))
 
names (food_df2) <- c("Section", "Article Title", "Article Link", "Author")
food_df2%>%
  kable() %>%
  kable_styling()

Section	Article Title	Article Link	Author
dining	Finding Strength in Sofrito in Puerto Rico	https://www.nytimes.com/2020/10/26/dining/puerto-rican-food.html	By Von Diaz
dining	Capturing the Heat and Crunch of Indonesian Cooking	https://www.nytimes.com/2020/10/27/dining/indonesian-food-lara-lee.html	By Kayla Stewart
dining	How Income Inequality Has Erased Your Chance to Drink the Great Wines	https://www.nytimes.com/2020/10/22/dining/drinks/wine-prices.html	By Eric Asimov
dining	Aiming for Perfect Asian Tapas, Even in a Pandemic	https://www.nytimes.com/2020/10/27/dining/mokyo-review.html	By Pete Wells
dining	Le Crocodile in the Wythe Hotel Opens for Private Dining	https://www.nytimes.com/2020/10/27/dining/nyc-restaurant-news.html	By Florence Fabricant
dining	For Best Results, Eat This Roti Immediately	https://www.nytimes.com/2020/10/26/dining/bridgetown-roti-los-angeles.html	By Tejal Rao
dining	This Dessert Goes Out to the Crust Lovers	https://www.nytimes.com/2020/10/23/dining/maple-pecan-galette-recipe.html	By Melissa Clark
dining	You’ll Definitely Want to Keep This Savory Sauce on Hand	https://www.nytimes.com/2020/10/23/dining/roasted-red-pepper-sauce-couscous-cake.html	By Yotam Ottolenghi
dining	Share Your Story of Keeping America Fed	https://www.nytimes.com/2020/10/26/dining/coronavirus-essential-workers-food.html	By The New York Times
nyregion	1.5 Million New Yorkers Can’t Afford Food. Pantries Are Their Lifeline.	https://www.nytimes.com/interactive/2020/10/20/nyregion/nyc-food-banks.html	By Nikita Stewart and Todd Heisler
dining	Seasons Change. Classic Drinks Do, Too.	https://www.nytimes.com/2020/10/22/dining/drinks/hot-toddy-recipe.html	By Rebekah Peppler
dining	The Promise of Pawpaw	https://www.nytimes.com/2020/10/19/dining/pawpaw-climate-change.html	By Rachel Wharton
dining	Turmoil, Strife and Wine: Reds From Lebanon	https://www.nytimes.com/2020/10/08/dining/drinks/wine-school-assignment-lebanon-red.html	By Eric Asimov
dining	An Around the World Kind of Rum	https://www.nytimes.com/2020/10/26/dining/plantation-rum-isle-of-fiji.html	By Florence Fabricant
dining	Your New Favorite Sauce	https://www.nytimes.com/2020/10/28/dining/your-new-favorite-sauce.html	By Sam Sifton
nyregion	N.J’s Largest City Shuts Down Again as Virus Cases Surge	https://www.nytimes.com/2020/10/28/nyregion/newark-coronavirus-curfew.html	By Tracey Tully and Kevin Armstrong
science	Sharks Wash Up on Beaches, Stabbed by Swordfish	https://www.nytimes.com/2020/10/27/science/swordfish-stabbing-sharks.html	By Joshua Sokol
dining	Pearl River Mart Opens a Food Market	https://www.nytimes.com/2020/10/27/dining/pearl-river-mart-foods-chelsea-market.html	By Florence Fabricant
insider	Giving Recipe Creators Their Due	https://www.nytimes.com/2020/10/27/insider/recipe-bylines.html	By Elisha Brown
dining	For a Cheese Lover, Two New Books	https://www.nytimes.com/2020/10/26/dining/cheese-books.html	By Florence Fabricant
dining	This Chile Crisp Has a Momofuku Pedigree	https://www.nytimes.com/2020/10/26/dining/chili-crunch-momofuku.html	By Florence Fabricant
dining	A West Coast Kouign-Amann	https://www.nytimes.com/2020/10/26/dining/manresa-bakery-kouign-amann.html	By Florence Fabricant
dining	Culinary Historians Turn to Zoom	https://www.nytimes.com/2020/10/26/dining/culinary-historians-ny-ben-katchor.html	By Florence Fabricant
dining	It’s Braising Season!	https://www.nytimes.com/2020/10/26/dining/its-braising-season.html	By Sam Sifton
dining	What to Cook This Week	https://www.nytimes.com/2020/10/25/dining/what-to-cook-this-week.html	By Sam Sifton
at-home	Hail the Supermoon And Howl at a Canine Costume Contest	https://www.nytimes.com/2020/10/24/at-home/things-to-do-this-week.html	By Adriana Balsamo and Katherine Cusumano
at-home	Ideas for Lifting Your Mood This Weekend	https://www.nytimes.com/2020/10/23/at-home/newsletter.html	By Melissa Kirsch
dining	One-Pot Deluxe	https://www.nytimes.com/2020/10/23/dining/one-pot-deluxe.html	By Emily Weinstein
dining	What to Cook This Weekend	https://www.nytimes.com/2020/10/23/dining/what-to-cook-this-weekend.html	By Sam Sifton
t-magazine	How to Create Your Own Herbal Tea Garden	https://www.nytimes.com/2020/10/22/t-magazine/how-to-grow-herbal-tea-garden.html	By Aimee Farrell
at-home	Take Comfort in Pumpkin This Season	https://www.nytimes.com/2020/10/22/at-home/pumpkin-recipes.html	By Margaux Laskey
t-magazine	The T List: Five Things We Recommend This Week	https://www.nytimes.com/2020/10/22/t-magazine/batsheva-troutbeck-compound.html
at-home	Finding Comfort in the In-Between Season	https://www.nytimes.com/2020/10/21/at-home/newsletter.html	By Melissa Kirsch
dining	The Case for Sausage Rolls	https://www.nytimes.com/2020/10/21/dining/the-case-for-sausage-rolls.html	By Sam Sifton
world	2 Broke Artists Started a Bakery at Home. It’s a Pandemic Hit.	https://www.nytimes.com/2020/10/21/world/americas/mexico-city-quarantine-baking.html	By Natalie Kitroeff
magazine	This Tuna-Salad Sandwich Is Julia Child-Approved Lunch	https://www.nytimes.com/2020/10/21/magazine/this-tuna-salad-sandwich-is-julia-child-approved-lunch.html	By Dorie Greenspan
magazine	The Fed-Up Chef	https://www.nytimes.com/2020/10/21/magazine/gaggan-anand.html	By Sheila Marikar
dining	City Winery Opens in Hudson River Park	https://www.nytimes.com/2020/10/20/dining/nyc-restaurant-news.html	By Florence Fabricant

Author Analysis

## Remove the word By
clean_author_df2 <- str_replace_all (food_df2$Author, "By", "")
author_df <- as.data.frame(clean_author_df2)
names(author_df) <- c("Author")

counts <- table (author_df$Author)


barplot(counts[order(counts)], horiz = TRUE, main="Author Food Article Frequency",
        ylab="Author Name", border = "black", col ="maroon", density = 10)

Let’s print the number of articles each author had since some of their names are too long for the above \(y-\)axis.

print (counts)

## 
##                                          Adriana Balsamo and Katherine Cusumano 
##                                       1                                       1 
##                           Aimee Farrell                         Dorie Greenspan 
##                                       1                                       1 
##                            Elisha Brown                         Emily Weinstein 
##                                       1                                       1 
##                             Eric Asimov                      Florence Fabricant 
##                                       2                                       8 
##                            Joshua Sokol                           Kayla Stewart 
##                                       1                                       1 
##                          Margaux Laskey                           Melissa Clark 
##                                       1                                       1 
##                          Melissa Kirsch                        Natalie Kitroeff 
##                                       2                                       1 
##         Nikita Stewart and Todd Heisler                              Pete Wells 
##                                       1                                       1 
##                          Rachel Wharton                         Rebekah Peppler 
##                                       1                                       1 
##                              Sam Sifton                          Sheila Marikar 
##                                       5                                       1 
##                               Tejal Rao                      The New York Times 
##                                       1                                       1 
##        Tracey Tully and Kevin Armstrong                                Von Diaz 
##                                       1                                       1 
##                        Yotam Ottolenghi 
##                                       1

Conclusion: Florence Fabricant is the author with the most New York Times food articles within the last week (8).

NoSQL vs. MySQL

Utilize these two packages in order to load the inventory data from MySQL

library(DBI)
library(RMySQL)
#for table visual
library(kableExtra)
#for plot visual
library (ggplot2)

Database Connection Code

mysql <- dbDriver("MySQL")
con <- dbConnect(mysql, dbname = 'inventory', user = 'root', password = 'cunydata', host = 'localhost')

Selecting Raw Data From Three Tables

#using SQL code in R
products <- dbGetQuery(con, "SELECT * FROM products") 
categories <- dbGetQuery(con, "SELECT * FROM categories") 
suppliers <- dbGetQuery(con, "SELECT * FROM suppliers")

Joined Data Frame

merged_data <- dbGetQuery(con, "SELECT a.productName, b.categoryName, c.companyName, a.unitPrice, a.unitsInStock  
FROM products as a 
INNER JOIN suppliers as c on a.supplierID=c.supplierID
INNER JOIN categories as b on a.categoryID=b.categoryID")

Final Data Frame

units_total <-as.data.frame(merged_data$unitPrice * merged_data$unitsInStock) 
colnames(units_total) <- 'Total Available ($)'
finaldf <- cbind (merged_data [1:3],units_total$`Total Available ($)`)
#column headers of final data frame
colnames(finaldf)<-c("Product","Category","Supplier Company","Total Available ($)")

#rename some of the values from the Category column
finaldf$Category[(finaldf$Category)=="Grains/Cereals"]<- "Grains" 
finaldf$Category[(finaldf$Category)== "Dairy Products"]<- "Dairy"
finaldf$Category[(finaldf$Category)=="Meat/Poultry"]<- "Meat"

glimpse(finaldf)

## Observations: 77
## Variables: 4
## $ Product               <chr> "Chai", "Chang", "Aniseed Syrup", "Chef Anton's…
## $ Category              <chr> "Beverages", "Beverages", "Condiments", "Condim…
## $ `Supplier Company`    <chr> "Exotic Liquids", "Exotic Liquids", "Exotic Liq…
## $ `Total Available ($)` <dbl> 702.00, 323.00, 130.00, 1166.00, 0.00, 1599.80,…

Analysis

#range of product total available in dollars by category
geo_plot <- ggplot(finaldf, aes(x = Category, y = `Total Available ($)` ))
geo_plot + geom_line(aes(color = Category))  + ggtitle("Range of Available Stocked Products by Product Category")

#ggplot2 scatterplot
ggScatPlot <- ggplot(finaldf, aes(x = Category, y = `Total Available ($)`)) + geom_point()
ggScatPlot <- ggScatPlot + geom_point(aes(color = Category))  
ggScatPlot <- ggScatPlot + ggtitle("Total Worth in Available Stocked Products by Product Category")
ggScatPlot

Export CSV Files for Github Repo (similar code used with Chess Tournament Data)

write.csv(products, "products.csv")
write.csv(categories, "categories.csv")
write.csv(suppliers, "suppliers.csv")

Neo4j Database Connection

library (neo4r)                     
con <- neo4j_api$new(url = "http://localhost:7687", user = "neo4j", password = "cunydata")

Load CSV Files from Github Repo to Neo4j Database (NO-SQL)

Conclusion

Relational Database (MySQL)

Advantages

Data is structured and organized
Commonly used Structured Query Langauge

Disadvantages

Fixed schema for organizing data that can be difficult to edit once implemented (lack of flexibility)

Graphic NoSQL Database (Neo4j)

Advantages

Faster processing/querying/adding new data
Flexible and can take in data without worrying about null values, relationship issues, or data structures
Build relationships based on nodes, so it is easier to analyze relationships between nodes (i.e there were 8 nodes for ‘category’)

Disadvantages

No standard query language
No predefined schema

Data Acquisition

Aaron Zalki

10/28/2020

Chess Tournament txt Data

Chess Tournament txt Data

Raw Chess Tournament Data

Transfer Data to Data Frame

Data Frame

Final Data Frame

XML, JSON, HTML

XML, JSON, HTML

XML

JSON

HTML

Conclusion

NY Times API

NY Times API

Author Analysis

NoSQL vs. MySQL

NoSQL vs. MySQL

Database Connection Code

Final Data Frame

Analysis

Neo4j Database Connection

Conclusion