October 18, 2017
From Excel to R
The R language is a statistical programming language, while Excel is a spreadsheet software where you can perform file operations on spreadsheets.
install.packages("googlesheets")
install.packages("readxl")
install.packages("XLConnect")
install.packages("ggplot2")
library(googlesheets)
## Warning: package 'googlesheets' was built under R version 3.3.3
# Authenticate
#gs_auth()
# List Directory and save
list1 <- gs_ls(); list1
## # A tibble: 24 x 10
## sheet_title author perm version
## <chr> <chr> <chr> <chr>
## 1 HubSpot Marketing Tools chris r new
## 2 173 - University of Cin<U+0085> danielle rw new
## 3 R Users Group Philippin<U+0085> josemarieant<U+0085> rw new
## 4 R Users Group Philippin<U+0085> joebrillantes rw new
## 5 R Users Group Philippin<U+0085> joebrillantes rw new
## 6 REO2_COMPUTATION_RRG_V1<U+0085> rowenremis.i<U+0085> rw new
## 7 [Sample] Infra Arch- RE<U+0085> zdrcardona rw new
## 8 Weekly Timesheet Report<U+0085> zjclopez r new
## 9 Spider list k rw new
## 10 REO2_COMPUTATION_RRG_V1<U+0085> dandelorcard<U+0085> rw new
## # ... with 14 more rows, and 6 more variables: updated <dttm>,
## # sheet_key <chr>, ws_feed <chr>, alternate <chr>, self <chr>,
## # alt_key <chr>
# Select a google sheet
gs1 <- gs_title("HubSpot Marketing Tools")
## Sheet successfully identified: "HubSpot Marketing Tools"
head(gs1)
## $sheet_key
## [1] "19RBrEhOK8-gs1qescFe7RbJgiTfMIVcz0Qm5n28lazU"
##
## $sheet_title
## [1] "HubSpot Marketing Tools"
##
## $n_ws
## [1] 1
##
## $ws_feed
## [1] "https://spreadsheets.google.com/feeds/worksheets/19RBrEhOK8-gs1qescFe7RbJgiTfMIVcz0Qm5n28lazU/private/values"
##
## $browser_url
## [1] "https://docs.google.com/spreadsheets/d/19RBrEhOK8-gs1qescFe7RbJgiTfMIVcz0Qm5n28lazU/"
##
## $updated
## [1] "2017-09-22 00:17:04 GMT"
# list worksheet
gs_ws_ls(gs1)
## [1] "Sheet1"
gs_test1 <- gs_read(gs1)
## Accessing worksheet titled 'Sheet1'.
## Warning: Missing column names filled in: 'X2' [2], 'X3' [3]
## Parsed with column specification:
## cols(
## `http://sumo.com` = col_character(),
## X2 = col_character(),
## X3 = col_character()
## )
head(gs_test1)
## # A tibble: 6 x 3
## `http://sumo.com` X2
## <chr> <chr>
## 1 HubSpot Tool Website
## 2 Scroll Box sumo.com/app/scroll-box
## 3 Content Upgrades sumo.com/app/list-builder
## 4 Sticky Share Bar sumo.com/app/share
## 5 Free CRM hubspot.com/products/crm
## 6 Facebook Messenger Bot motion.ai
## # ... with 1 more variables: X3 <chr>
dir()
## [1] "FilmData.csv" "FilmData.xlsx"
## [3] "hs_err_pid2996.log" "reg"
## [5] "replay_pid2996.log" "RUG_Excel_to_R_20171018.html"
## [7] "RUG_Excel_to_R_20171018.Rmd"
library(XLConnect)
## Warning: package 'XLConnect' was built under R version 3.3.3
## Loading required package: XLConnectJars
## Warning: package 'XLConnectJars' was built under R version 3.3.3
## XLConnect 0.2-13 by Mirai Solutions GmbH [aut],
## Martin Studer [cre],
## The Apache Software Foundation [ctb, cph] (Apache POI),
## Graph Builder [ctb, cph] (Curvesapi Java library)
## http://www.mirai-solutions.com ,
## http://miraisolutions.wordpress.com
wb <- loadWorkbook("FilmData.xlsx")
df1 <- readWorksheet(wb, sheet = 1, header = TRUE)
names(df1)
## [1] "Rank" "Film" "Studio" "Genre" "Year"
## [6] "Gross" "Gross.Dom" "Pct.Dom" "Gross.Ovr" "Pct.Ovr"
## [11] "Rotten" "IMDB" "Rating" "Days" "Budget"
head(df1)
## Rank Film Studio Genre
## 1 1 Avatar Fox Action/Adventure
## 2 2 Titanic Par. Drama
## 3 4 Harry Potter and the Deathly Hallows Part 2 WB Action/Adventure
## 4 8 Skyfall Sony Action/Adventure
## 5 9 The Dark Knight Rises WB Action/Adventure
## 6 13 Star Wars: Episode I - The Phantom Menace Fox Action/Adventure
## Year Gross Gross.Dom Pct.Dom Gross.Ovr Pct.Ovr Rotten IMDB Rating
## 1 2009 2782300000 760500000 0.273 2021800000 0.727 83 8.0 PG13
## 2 1997 2185400000 658700000 0.301 1526700000 0.699 88 7.6 PG13
## 3 2011 1341500000 381000000 0.284 960500000 0.716 96 8.1 PG13
## 4 2012 1108600000 304400000 0.275 804200000 0.725 92 7.8 PG13
## 5 2012 1084400000 448100000 0.413 636300000 0.587 87 8.6 PG13
## 6 1999 1027000000 474500000 0.462 552500000 0.538 57 6.5 PG
## Days Budget
## 1 238 237
## 2 219 200
## 3 133 125
## 4 122 200
## 5 147 250
## 6 261 115
library(readxl)
## Warning: package 'readxl' was built under R version 3.3.3
df2 <- read_excel("FilmData.xlsx")
names(df2)
## [1] "Rank" "Film" "Studio" "Genre" "Year"
## [6] "Gross" "Gross.Dom" "Pct.Dom" "Gross.Ovr" "Pct.Ovr"
## [11] "Rotten" "IMDB" "Rating" "Days" "Budget"
head(df2)
## # A tibble: 6 x 15
## Rank Film Studio
## <dbl> <chr> <chr>
## 1 1 Avatar Fox
## 2 2 Titanic Par.
## 3 4 Harry Potter and the Deathly Hallows Part 2 WB
## 4 8 Skyfall Sony
## 5 9 The Dark Knight Rises WB
## 6 13 Star Wars: Episode I - The Phantom Menace Fox
## # ... with 12 more variables: Genre <chr>, Year <dbl>, Gross <dbl>,
## # Gross.Dom <dbl>, Pct.Dom <dbl>, Gross.Ovr <dbl>, Pct.Ovr <dbl>,
## # Rotten <dbl>, IMDB <dbl>, Rating <chr>, Days <dbl>, Budget <dbl>
#View(df2)
## Access by Sheet name
# get Sheet names
excel_sheets("FilmData.xlsx")
## [1] "FilmData"
# Access by Sheet Number
head(read_excel("FilmData.xlsx",1))
## # A tibble: 6 x 15
## Rank Film Studio
## <dbl> <chr> <chr>
## 1 1 Avatar Fox
## 2 2 Titanic Par.
## 3 4 Harry Potter and the Deathly Hallows Part 2 WB
## 4 8 Skyfall Sony
## 5 9 The Dark Knight Rises WB
## 6 13 Star Wars: Episode I - The Phantom Menace Fox
## # ... with 12 more variables: Genre <chr>, Year <dbl>, Gross <dbl>,
## # Gross.Dom <dbl>, Pct.Dom <dbl>, Gross.Ovr <dbl>, Pct.Ovr <dbl>,
## # Rotten <dbl>, IMDB <dbl>, Rating <chr>, Days <dbl>, Budget <dbl>
Get Summary
Note: Excel and R behave differently on standard deviation, please check and test as the behavior differs because of N (population) and small n (sample)
# For some graphs
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
# options
options("scipen"=100, "digits"=4)
# Column names
names(df2)
## [1] "Rank" "Film" "Studio" "Genre" "Year"
## [6] "Gross" "Gross.Dom" "Pct.Dom" "Gross.Ovr" "Pct.Ovr"
## [11] "Rotten" "IMDB" "Rating" "Days" "Budget"
# Inspect data structure
str(df2)
## Classes 'tbl_df', 'tbl' and 'data.frame': 151 obs. of 15 variables:
## $ Rank : num 1 2 4 8 9 13 15 16 17 18 ...
## $ Film : chr "Avatar" "Titanic" "Harry Potter and the Deathly Hallows Part 2" "Skyfall" ...
## $ Studio : chr "Fox" "Par." "WB" "Sony" ...
## $ Genre : chr "Action/Adventure" "Drama" "Action/Adventure" "Action/Adventure" ...
## $ Year : num 2009 1997 2011 2012 2012 ...
## $ Gross : num 2782300000 2185400000 1341500000 1108600000 1084400000 ...
## $ Gross.Dom: num 760500000 658700000 381000000 304400000 448100000 ...
## $ Pct.Dom : num 0.273 0.301 0.284 0.275 0.413 0.462 0.298 0.532 0.326 0.415 ...
## $ Gross.Ovr: num 2021800000 1526700000 960500000 804200000 636300000 ...
## $ Pct.Ovr : num 0.727 0.699 0.716 0.725 0.587 0.538 0.702 0.468 0.674 0.585 ...
## $ Rotten : num 83 88 96 92 87 57 65 94 80 92 ...
## $ IMDB : num 8 7.6 8.1 7.8 8.6 6.5 8.1 9 7.3 8 ...
## $ Rating : chr "PG13" "PG13" "PG13" "PG13" ...
## $ Days : num 238 219 133 122 147 261 133 231 162 147 ...
## $ Budget : num 237 200 125 200 250 115 180 185 125 63 ...
# Dimension
dim(df2)
## [1] 151 15
# Number of Rows
nrow(df2)
## [1] 151
# Number of Columns
ncol(df2)
## [1] 15
# Get Overview of Gross through Histogram
hist(df2$Gross)
ggplot(data=df2, aes(Gross )) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Average
mean(df2$Gross)
## [1] 564067550
# Median
median(df2$Gross)
## [1] 470700000
# Standard Deviation
sd(df2$Gross)
## [1] 304307894
# Table on Genre
table(df2$Genre)
##
## Action/Adventure Animation Comedy Drama
## 111 14 17 9
# Proportion on Genre
prop.table(table(df2$Genre))
##
## Action/Adventure Animation Comedy Drama
## 0.73510 0.09272 0.11258 0.05960
# Plot
ggplot(data = df2, aes(x=Genre, fill=Genre)) + geom_bar()
# Summary Statistics
summary(df2$Gross)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 332000000 379000000 471000000 564000000 624000000 2780000000
summary(df2)
## Rank Film Studio Genre
## Min. : 1.0 Length:151 Length:151 Length:151
## 1st Qu.: 69.5 Class :character Class :character Class :character
## Median :126.0 Mode :character Mode :character Mode :character
## Mean :124.9
## 3rd Qu.:183.5
## Max. :245.0
##
## Year Gross Gross.Dom Pct.Dom
## Min. :1973 Min. : 332000000 Min. : 77600000 Min. :0.184
## 1st Qu.:1998 1st Qu.: 378950000 1st Qu.:155350000 1st Qu.:0.328
## Median :2005 Median : 470700000 Median :196600000 Median :0.398
## Mean :2003 Mean : 564067550 Mean :221336424 Mean :0.409
## 3rd Qu.:2009 3rd Qu.: 623950000 3rd Qu.:257050000 3rd Qu.:0.489
## Max. :2013 Max. :2782300000 Max. :760500000 Max. :0.753
##
## Gross.Ovr Pct.Ovr Rotten IMDB
## Min. : 85100000 Min. :0.247 Min. : 13.0 Min. :4.70
## 1st Qu.: 208700000 1st Qu.:0.511 1st Qu.: 52.5 1st Qu.:6.40
## Median : 277000000 Median :0.602 Median : 70.0 Median :7.10
## Mean : 342735099 Mean :0.591 Mean : 66.7 Mean :7.04
## 3rd Qu.: 403300000 3rd Qu.:0.672 3rd Qu.: 84.0 3rd Qu.:7.60
## Max. :2021800000 Max. :0.816 Max. :100.0 Max. :9.00
##
## Rating Days Budget
## Length:151 Min. : 56 Min. : 6.0
## Class :character 1st Qu.:115 1st Qu.: 69.2
## Mode :character Median :140 Median :110.0
## Mean :141 Mean :109.7
## 3rd Qu.:158 3rd Qu.:150.0
## Max. :280 Max. :270.0
## NA's :1
# Get Unique values
unique(df2$Genre)
## [1] "Action/Adventure" "Drama" "Animation"
## [4] "Comedy"
# Max
max(df2$Gross)
## [1] 2782300000
# Min
min(df2$Gross)
## [1] 332000000
# Splitting a string
str1 <- "freshman,sophomore,junior,senior"; str1
## [1] "freshman,sophomore,junior,senior"
HS_levels <- unlist(strsplit(str1, ",")); HS_levels
## [1] "freshman" "sophomore" "junior" "senior"
http://blog.revolutionanalytics.com/2015/09/using-the-googlesheets-package-to-work-with-google-sheets.html - R googlesheets package https://github.com/jennybc/googlesheets - Google Sheets R Api https://www.r-bloggers.com/a-million-ways-to-connect-r-and-excel/ - A million ways to connect R and Excel https://www.quantinsti.com/blog/rexcel-tutorial-using-r-excel/ - RExcel Tutorial http://www.sthda.com/english/wiki/r-xlsx-package-a-quick-start-guide-to-manipulate-excel-files-in-r - R xlsx Package https://blog.rstudio.com/2015/04/15/readxl-0-1-0/ - Get data out of excel and into R with readxl
sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 7 x64 (build 7601) Service Pack 1
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_2.2.1 readxl_1.0.0 XLConnect_0.2-13
## [4] XLConnectJars_0.2-13 googlesheets_0.2.2
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.10 knitr_1.17 xml2_1.0.0 magrittr_1.5
## [5] munsell_0.4.3 colorspace_1.3-1 R6_2.2.0 rlang_0.1.1
## [9] plyr_1.8.4 stringr_1.1.0 httr_1.2.1 dplyr_0.5.0
## [13] tools_3.3.2 grid_3.3.2 gtable_0.2.0 DBI_0.6
## [17] htmltools_0.3.5 openssl_0.9.5 yaml_2.1.13 lazyeval_0.2.0
## [21] assertthat_0.1 rprojroot_1.1 digest_0.6.10 tibble_1.3.4
## [25] rJava_0.9-8 purrr_0.2.2 readr_1.0.0 curl_2.3
## [29] evaluate_0.10 rmarkdown_1.5 labeling_0.3 stringi_1.1.2
## [33] cellranger_1.1.0 scales_0.4.1 backports_1.1.0 jsonlite_1.1