R Users Group Philippines

October 18, 2017

From Excel to R

Excel to R

The R language is a statistical programming language, while Excel is a spreadsheet software where you can perform file operations on spreadsheets.

Installing the Libraries

install.packages("googlesheets")

install.packages("readxl")
install.packages("XLConnect")

install.packages("ggplot2")

googlesheets R Package

library(googlesheets)
## Warning: package 'googlesheets' was built under R version 3.3.3
# Authenticate
#gs_auth()

# List Directory and save
list1 <- gs_ls(); list1
## # A tibble: 24 x 10
##                 sheet_title        author  perm version
##                       <chr>         <chr> <chr>   <chr>
##  1  HubSpot Marketing Tools         chris     r     new
##  2 173 - University of Cin<U+0085>      danielle    rw     new
##  3 R Users Group Philippin<U+0085> josemarieant<U+0085>    rw     new
##  4 R Users Group Philippin<U+0085> joebrillantes    rw     new
##  5 R Users Group Philippin<U+0085> joebrillantes    rw     new
##  6 REO2_COMPUTATION_RRG_V1<U+0085> rowenremis.i<U+0085>    rw     new
##  7 [Sample] Infra Arch- RE<U+0085>    zdrcardona    rw     new
##  8 Weekly Timesheet Report<U+0085>      zjclopez     r     new
##  9              Spider list             k    rw     new
## 10 REO2_COMPUTATION_RRG_V1<U+0085> dandelorcard<U+0085>    rw     new
## # ... with 14 more rows, and 6 more variables: updated <dttm>,
## #   sheet_key <chr>, ws_feed <chr>, alternate <chr>, self <chr>,
## #   alt_key <chr>
# Select a google sheet
gs1 <- gs_title("HubSpot Marketing Tools")
## Sheet successfully identified: "HubSpot Marketing Tools"
head(gs1)
## $sheet_key
## [1] "19RBrEhOK8-gs1qescFe7RbJgiTfMIVcz0Qm5n28lazU"
## 
## $sheet_title
## [1] "HubSpot Marketing Tools"
## 
## $n_ws
## [1] 1
## 
## $ws_feed
## [1] "https://spreadsheets.google.com/feeds/worksheets/19RBrEhOK8-gs1qescFe7RbJgiTfMIVcz0Qm5n28lazU/private/values"
## 
## $browser_url
## [1] "https://docs.google.com/spreadsheets/d/19RBrEhOK8-gs1qescFe7RbJgiTfMIVcz0Qm5n28lazU/"
## 
## $updated
## [1] "2017-09-22 00:17:04 GMT"
# list worksheet
gs_ws_ls(gs1)
## [1] "Sheet1"
gs_test1 <- gs_read(gs1)
## Accessing worksheet titled 'Sheet1'.
## Warning: Missing column names filled in: 'X2' [2], 'X3' [3]
## Parsed with column specification:
## cols(
##   `http://sumo.com` = col_character(),
##   X2 = col_character(),
##   X3 = col_character()
## )
head(gs_test1)
## # A tibble: 6 x 3
##        `http://sumo.com`                        X2
##                    <chr>                     <chr>
## 1           HubSpot Tool                   Website
## 2             Scroll Box   sumo.com/app/scroll-box
## 3       Content Upgrades sumo.com/app/list-builder
## 4       Sticky Share Bar        sumo.com/app/share
## 5               Free CRM  hubspot.com/products/crm
## 6 Facebook Messenger Bot                 motion.ai
## # ... with 1 more variables: X3 <chr>

XLConnect Package

dir()
## [1] "FilmData.csv"                 "FilmData.xlsx"               
## [3] "hs_err_pid2996.log"           "reg"                         
## [5] "replay_pid2996.log"           "RUG_Excel_to_R_20171018.html"
## [7] "RUG_Excel_to_R_20171018.Rmd"
library(XLConnect)
## Warning: package 'XLConnect' was built under R version 3.3.3
## Loading required package: XLConnectJars
## Warning: package 'XLConnectJars' was built under R version 3.3.3
## XLConnect 0.2-13 by Mirai Solutions GmbH [aut],
##   Martin Studer [cre],
##   The Apache Software Foundation [ctb, cph] (Apache POI),
##   Graph Builder [ctb, cph] (Curvesapi Java library)
## http://www.mirai-solutions.com ,
## http://miraisolutions.wordpress.com
wb <- loadWorkbook("FilmData.xlsx")
df1 <- readWorksheet(wb, sheet = 1, header = TRUE)

names(df1)
##  [1] "Rank"      "Film"      "Studio"    "Genre"     "Year"     
##  [6] "Gross"     "Gross.Dom" "Pct.Dom"   "Gross.Ovr" "Pct.Ovr"  
## [11] "Rotten"    "IMDB"      "Rating"    "Days"      "Budget"
head(df1)
##   Rank                                        Film Studio            Genre
## 1    1                                      Avatar    Fox Action/Adventure
## 2    2                                     Titanic   Par.            Drama
## 3    4 Harry Potter and the Deathly Hallows Part 2     WB Action/Adventure
## 4    8                                     Skyfall   Sony Action/Adventure
## 5    9                       The Dark Knight Rises     WB Action/Adventure
## 6   13   Star Wars: Episode I - The Phantom Menace    Fox Action/Adventure
##   Year      Gross Gross.Dom Pct.Dom  Gross.Ovr Pct.Ovr Rotten IMDB Rating
## 1 2009 2782300000 760500000   0.273 2021800000   0.727     83  8.0   PG13
## 2 1997 2185400000 658700000   0.301 1526700000   0.699     88  7.6   PG13
## 3 2011 1341500000 381000000   0.284  960500000   0.716     96  8.1   PG13
## 4 2012 1108600000 304400000   0.275  804200000   0.725     92  7.8   PG13
## 5 2012 1084400000 448100000   0.413  636300000   0.587     87  8.6   PG13
## 6 1999 1027000000 474500000   0.462  552500000   0.538     57  6.5     PG
##   Days Budget
## 1  238    237
## 2  219    200
## 3  133    125
## 4  122    200
## 5  147    250
## 6  261    115

readxl Package

library(readxl)
## Warning: package 'readxl' was built under R version 3.3.3
df2 <- read_excel("FilmData.xlsx")
names(df2)
##  [1] "Rank"      "Film"      "Studio"    "Genre"     "Year"     
##  [6] "Gross"     "Gross.Dom" "Pct.Dom"   "Gross.Ovr" "Pct.Ovr"  
## [11] "Rotten"    "IMDB"      "Rating"    "Days"      "Budget"
head(df2)
## # A tibble: 6 x 15
##    Rank                                        Film Studio
##   <dbl>                                       <chr>  <chr>
## 1     1                                      Avatar    Fox
## 2     2                                     Titanic   Par.
## 3     4 Harry Potter and the Deathly Hallows Part 2     WB
## 4     8                                     Skyfall   Sony
## 5     9                       The Dark Knight Rises     WB
## 6    13   Star Wars: Episode I - The Phantom Menace    Fox
## # ... with 12 more variables: Genre <chr>, Year <dbl>, Gross <dbl>,
## #   Gross.Dom <dbl>, Pct.Dom <dbl>, Gross.Ovr <dbl>, Pct.Ovr <dbl>,
## #   Rotten <dbl>, IMDB <dbl>, Rating <chr>, Days <dbl>, Budget <dbl>
#View(df2)

## Access by Sheet name
# get Sheet names
excel_sheets("FilmData.xlsx")
## [1] "FilmData"
# Access by Sheet Number
head(read_excel("FilmData.xlsx",1))
## # A tibble: 6 x 15
##    Rank                                        Film Studio
##   <dbl>                                       <chr>  <chr>
## 1     1                                      Avatar    Fox
## 2     2                                     Titanic   Par.
## 3     4 Harry Potter and the Deathly Hallows Part 2     WB
## 4     8                                     Skyfall   Sony
## 5     9                       The Dark Knight Rises     WB
## 6    13   Star Wars: Episode I - The Phantom Menace    Fox
## # ... with 12 more variables: Genre <chr>, Year <dbl>, Gross <dbl>,
## #   Gross.Dom <dbl>, Pct.Dom <dbl>, Gross.Ovr <dbl>, Pct.Ovr <dbl>,
## #   Rotten <dbl>, IMDB <dbl>, Rating <chr>, Days <dbl>, Budget <dbl>

Excel Functions

  • Average
  • Median
  • Standard Deviation
  • Unique Value
  • Split text
  • Min
  • Max
  • Get Summary

  • Note: Excel and R behave differently on standard deviation, please check and test as the behavior differs because of N (population) and small n (sample)

# For some graphs
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
# options
options("scipen"=100, "digits"=4)

# Column names
names(df2)
##  [1] "Rank"      "Film"      "Studio"    "Genre"     "Year"     
##  [6] "Gross"     "Gross.Dom" "Pct.Dom"   "Gross.Ovr" "Pct.Ovr"  
## [11] "Rotten"    "IMDB"      "Rating"    "Days"      "Budget"
# Inspect data structure
str(df2)
## Classes 'tbl_df', 'tbl' and 'data.frame':    151 obs. of  15 variables:
##  $ Rank     : num  1 2 4 8 9 13 15 16 17 18 ...
##  $ Film     : chr  "Avatar" "Titanic" "Harry Potter and the Deathly Hallows Part 2" "Skyfall" ...
##  $ Studio   : chr  "Fox" "Par." "WB" "Sony" ...
##  $ Genre    : chr  "Action/Adventure" "Drama" "Action/Adventure" "Action/Adventure" ...
##  $ Year     : num  2009 1997 2011 2012 2012 ...
##  $ Gross    : num  2782300000 2185400000 1341500000 1108600000 1084400000 ...
##  $ Gross.Dom: num  760500000 658700000 381000000 304400000 448100000 ...
##  $ Pct.Dom  : num  0.273 0.301 0.284 0.275 0.413 0.462 0.298 0.532 0.326 0.415 ...
##  $ Gross.Ovr: num  2021800000 1526700000 960500000 804200000 636300000 ...
##  $ Pct.Ovr  : num  0.727 0.699 0.716 0.725 0.587 0.538 0.702 0.468 0.674 0.585 ...
##  $ Rotten   : num  83 88 96 92 87 57 65 94 80 92 ...
##  $ IMDB     : num  8 7.6 8.1 7.8 8.6 6.5 8.1 9 7.3 8 ...
##  $ Rating   : chr  "PG13" "PG13" "PG13" "PG13" ...
##  $ Days     : num  238 219 133 122 147 261 133 231 162 147 ...
##  $ Budget   : num  237 200 125 200 250 115 180 185 125 63 ...
# Dimension
dim(df2)
## [1] 151  15
# Number of Rows
nrow(df2)
## [1] 151
# Number of Columns
ncol(df2)
## [1] 15
# Get Overview of Gross through Histogram
hist(df2$Gross)

ggplot(data=df2, aes(Gross )) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Average
mean(df2$Gross)
## [1] 564067550
# Median
median(df2$Gross)
## [1] 470700000
# Standard Deviation
sd(df2$Gross)
## [1] 304307894
# Table on Genre
table(df2$Genre)
## 
## Action/Adventure        Animation           Comedy            Drama 
##              111               14               17                9
# Proportion on Genre
prop.table(table(df2$Genre))
## 
## Action/Adventure        Animation           Comedy            Drama 
##          0.73510          0.09272          0.11258          0.05960
# Plot
ggplot(data = df2, aes(x=Genre, fill=Genre)) + geom_bar()

# Summary Statistics
summary(df2$Gross)
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
##  332000000  379000000  471000000  564000000  624000000 2780000000
summary(df2)
##       Rank           Film              Studio             Genre          
##  Min.   :  1.0   Length:151         Length:151         Length:151        
##  1st Qu.: 69.5   Class :character   Class :character   Class :character  
##  Median :126.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :124.9                                                           
##  3rd Qu.:183.5                                                           
##  Max.   :245.0                                                           
##                                                                          
##       Year          Gross              Gross.Dom            Pct.Dom     
##  Min.   :1973   Min.   : 332000000   Min.   : 77600000   Min.   :0.184  
##  1st Qu.:1998   1st Qu.: 378950000   1st Qu.:155350000   1st Qu.:0.328  
##  Median :2005   Median : 470700000   Median :196600000   Median :0.398  
##  Mean   :2003   Mean   : 564067550   Mean   :221336424   Mean   :0.409  
##  3rd Qu.:2009   3rd Qu.: 623950000   3rd Qu.:257050000   3rd Qu.:0.489  
##  Max.   :2013   Max.   :2782300000   Max.   :760500000   Max.   :0.753  
##                                                                         
##    Gross.Ovr             Pct.Ovr          Rotten           IMDB     
##  Min.   :  85100000   Min.   :0.247   Min.   : 13.0   Min.   :4.70  
##  1st Qu.: 208700000   1st Qu.:0.511   1st Qu.: 52.5   1st Qu.:6.40  
##  Median : 277000000   Median :0.602   Median : 70.0   Median :7.10  
##  Mean   : 342735099   Mean   :0.591   Mean   : 66.7   Mean   :7.04  
##  3rd Qu.: 403300000   3rd Qu.:0.672   3rd Qu.: 84.0   3rd Qu.:7.60  
##  Max.   :2021800000   Max.   :0.816   Max.   :100.0   Max.   :9.00  
##                                                                     
##     Rating               Days         Budget     
##  Length:151         Min.   : 56   Min.   :  6.0  
##  Class :character   1st Qu.:115   1st Qu.: 69.2  
##  Mode  :character   Median :140   Median :110.0  
##                     Mean   :141   Mean   :109.7  
##                     3rd Qu.:158   3rd Qu.:150.0  
##                     Max.   :280   Max.   :270.0  
##                                   NA's   :1
# Get Unique values
unique(df2$Genre)
## [1] "Action/Adventure" "Drama"            "Animation"       
## [4] "Comedy"
# Max
max(df2$Gross)
## [1] 2782300000
# Min
min(df2$Gross)
## [1] 332000000
# Splitting a string
str1 <- "freshman,sophomore,junior,senior"; str1
## [1] "freshman,sophomore,junior,senior"
HS_levels <- unlist(strsplit(str1, ",")); HS_levels
## [1] "freshman"  "sophomore" "junior"    "senior"

References

http://blog.revolutionanalytics.com/2015/09/using-the-googlesheets-package-to-work-with-google-sheets.html - R googlesheets package
https://github.com/jennybc/googlesheets - Google Sheets R Api

https://www.r-bloggers.com/a-million-ways-to-connect-r-and-excel/ - A million ways to connect R and Excel
https://www.quantinsti.com/blog/rexcel-tutorial-using-r-excel/ - RExcel Tutorial
http://www.sthda.com/english/wiki/r-xlsx-package-a-quick-start-guide-to-manipulate-excel-files-in-r - R xlsx Package
https://blog.rstudio.com/2015/04/15/readxl-0-1-0/ - Get data out of excel and into R with readxl

About Me

Rowen Remis R. Iral

Data Science / IT Engineer consultant

http://wenup.wordpress.com

Session Info

sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 7 x64 (build 7601) Service Pack 1
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_2.2.1        readxl_1.0.0         XLConnect_0.2-13    
## [4] XLConnectJars_0.2-13 googlesheets_0.2.2  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.10     knitr_1.17       xml2_1.0.0       magrittr_1.5    
##  [5] munsell_0.4.3    colorspace_1.3-1 R6_2.2.0         rlang_0.1.1     
##  [9] plyr_1.8.4       stringr_1.1.0    httr_1.2.1       dplyr_0.5.0     
## [13] tools_3.3.2      grid_3.3.2       gtable_0.2.0     DBI_0.6         
## [17] htmltools_0.3.5  openssl_0.9.5    yaml_2.1.13      lazyeval_0.2.0  
## [21] assertthat_0.1   rprojroot_1.1    digest_0.6.10    tibble_1.3.4    
## [25] rJava_0.9-8      purrr_0.2.2      readr_1.0.0      curl_2.3        
## [29] evaluate_0.10    rmarkdown_1.5    labeling_0.3     stringi_1.1.2   
## [33] cellranger_1.1.0 scales_0.4.1     backports_1.1.0  jsonlite_1.1