R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)

Including Plots

You can also embed plots, for example:

# Economics dataset from GGally package
head(economics)
## # A tibble: 6 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
## 4 1967-10-01  512. 199311    12.9     4.9     3143
## 5 1967-11-01  517. 199498    12.8     4.7     3066
## 6 1967-12-01  525. 199657    11.8     4.8     3018
# Calculate correlation between pce and psavert
cor(economics$pce, economics$psavert)
## [1] -0.7928546

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# Calculate each part of the correlation manually
xPart <- economics$pce - mean(economics$pce)
yPart <- economics$psavert - mean(economics$psavert)
nMinusOne <- (nrow(economics) - 1)
xSD <- sd(economics$pce)
ySD <- sd(economics$psavert)

# Use the correlation formula
sum(xPart * yPart) / (nMinusOne * xSD * ySD)
## [1] -0.7928546
# Calculate each part of the correlation manually
xPart <- economics$pce - mean(economics$pce)
yPart <- economics$psavert - mean(economics$psavert)
nMinusOne <- (nrow(economics) - 1)
xSD <- sd(economics$pce)
ySD <- sd(economics$psavert)

# Use the correlation formula
sum(xPart * yPart) / (nMinusOne * xSD * ySD)
# Calculate the correlation matrix for variables 2, 4, 5, 6 in the economics dataset
cormat <- round(cor(economics[, c(2, 4:6)]), 2)
head(cormat)
##            pce psavert uempmed unemploy
## pce       1.00   -0.79    0.73     0.61
## psavert  -0.79    1.00   -0.33    -0.31
## uempmed   0.73   -0.33    1.00     0.87
## unemploy  0.61   -0.31    0.87     1.00
# Load the reshape2 package
library(reshape2)

# Reshape the correlation matrix into long format for plotting
melted_cormat <- reshape2::melt(cormat)
head(melted_cormat)
##       Var1    Var2 value
## 1      pce     pce  1.00
## 2  psavert     pce -0.79
## 3  uempmed     pce  0.73
## 4 unemploy     pce  0.61
## 5      pce psavert -0.79
## 6  psavert psavert  1.00
# Load the ggplot2 package
library(ggplot2)

# Plot the correlation matrix as a heatmap
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile()

# Dealing with missing data
m <- c(9, 9, NA, 3, NA, 5, 8, 1, 10, 4)
n <- c(2, NA, 1, 6, 6, 4, 1, 1, 6, 7)
p <- c(8, 4, 3, 9, 10, NA, 3, NA, 9, 9)
q <- c(10, 10, 7, 8, 4, 2, 8, 5, 5, 2)
r <- c(1, 9, 7, 6, 5, 6, 2, 7, 9, 10)

# Combine them together
theMat <- cbind(m, n, p, q, r)
theMat
##        m  n  p  q  r
##  [1,]  9  2  8 10  1
##  [2,]  9 NA  4 10  9
##  [3,] NA  1  3  7  7
##  [4,]  3  6  9  8  6
##  [5,] NA  6 10  4  5
##  [6,]  5  4 NA  2  6
##  [7,]  8  1  3  8  2
##  [8,]  1  1 NA  5  7
##  [9,] 10  6  9  5  9
## [10,]  4  7  9  2 10
# Compute correlations with different methods of handling NA values
cor(theMat, use="everything")
##    m  n  p          q          r
## m  1 NA NA         NA         NA
## n NA  1 NA         NA         NA
## p NA NA  1         NA         NA
## q NA NA NA  1.0000000 -0.4242958
## r NA NA NA -0.4242958  1.0000000
cor(theMat, use="complete.obs")
##            m          n          p          q          r
## m  1.0000000 -0.5228840 -0.2893527  0.2974398 -0.3459470
## n -0.5228840  1.0000000  0.8090195 -0.7448453  0.9350718
## p -0.2893527  0.8090195  1.0000000 -0.3613720  0.6221470
## q  0.2974398 -0.7448453 -0.3613720  1.0000000 -0.9059384
## r -0.3459470  0.9350718  0.6221470 -0.9059384  1.0000000
cor(theMat, use="na.or.complete")
##            m          n          p          q          r
## m  1.0000000 -0.5228840 -0.2893527  0.2974398 -0.3459470
## n -0.5228840  1.0000000  0.8090195 -0.7448453  0.9350718
## p -0.2893527  0.8090195  1.0000000 -0.3613720  0.6221470
## q  0.2974398 -0.7448453 -0.3613720  1.0000000 -0.9059384
## r -0.3459470  0.9350718  0.6221470 -0.9059384  1.0000000
# Calculate the correlation just on complete rows
cor(theMat[c(1, 4, 7, 9, 10), ])
##            m          n          p          q          r
## m  1.0000000 -0.5228840 -0.2893527  0.2974398 -0.3459470
## n -0.5228840  1.0000000  0.8090195 -0.7448453  0.9350718
## p -0.2893527  0.8090195  1.0000000 -0.3613720  0.6221470
## q  0.2974398 -0.7448453 -0.3613720  1.0000000 -0.9059384
## r -0.3459470  0.9350718  0.6221470 -0.9059384  1.0000000
# Compare "complete.obs" and computing on select rows
identical(cor(theMat, use="complete.obs"),
         cor(theMat[c(1, 4, 7, 9, 10), ]))
## [1] TRUE
# Load and explore the 'tips' dataset
data(tips, package="reshape2")
head(tips)
##   total_bill  tip    sex smoker day   time size
## 1      16.99 1.01 Female     No Sun Dinner    2
## 2      10.34 1.66   Male     No Sun Dinner    3
## 3      21.01 3.50   Male     No Sun Dinner    3
## 4      23.68 3.31   Male     No Sun Dinner    2
## 5      24.59 3.61 Female     No Sun Dinner    4
## 6      25.29 4.71   Male     No Sun Dinner    4
str(tips)
## 'data.frame':    244 obs. of  7 variables:
##  $ total_bill: num  17 10.3 21 23.7 24.6 ...
##  $ tip       : num  1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
##  $ sex       : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 2 2 2 ...
##  $ smoker    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ day       : Factor w/ 4 levels "Fri","Sat","Sun",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ time      : Factor w/ 2 levels "Dinner","Lunch": 1 1 1 1 1 1 1 1 1 1 ...
##  $ size      : int  2 3 3 2 4 4 2 4 2 2 ...
unique(tips$sex)
## [1] Female Male  
## Levels: Female Male
unique(tips$day)
## [1] Sun  Sat  Thur Fri 
## Levels: Fri Sat Sun Thur
# T-test to check if mean tip differs from 2.50
t.test(tips$tip, alternative="two.sided", mu=2.50)
## 
##  One Sample t-test
## 
## data:  tips$tip
## t = 5.6253, df = 243, p-value = 5.08e-08
## alternative hypothesis: true mean is not equal to 2.5
## 95 percent confidence interval:
##  2.823799 3.172758
## sample estimates:
## mean of x 
##  2.998279
# Plotting the t-distribution and test statistic
randT <- rt(30000, df=NROW(tips)-1)
tipTTest <- t.test(tips$tip, alternative="two.sided", mu=2.50)

ggplot(data.frame(x=randT)) +
  geom_density(aes(x=x), fill="grey", color="grey") +
  geom_vline(xintercept=tipTTest$statistic) +
  geom_vline(xintercept=mean(randT) + c(-2, 2)*sd(randT), linetype=2)

# Variance of tips by sex
aggregate(tip ~ sex, data=tips, var)
##      sex      tip
## 1 Female 1.344428
## 2   Male 2.217424
# Shapiro-Wilk normality test
shapiro.test(tips$tip)
## 
##  Shapiro-Wilk normality test
## 
## data:  tips$tip
## W = 0.89781, p-value = 8.2e-12
shapiro.test(tips$tip[tips$sex == "Female"])
## 
##  Shapiro-Wilk normality test
## 
## data:  tips$tip[tips$sex == "Female"]
## W = 0.95678, p-value = 0.005448
shapiro.test(tips$tip[tips$sex == "Male"])
## 
##  Shapiro-Wilk normality test
## 
## data:  tips$tip[tips$sex == "Male"]
## W = 0.87587, p-value = 3.708e-10
# Visual inspection of distribution
ggplot(tips, aes(x=tip, fill=sex)) +
  geom_histogram(binwidth=.5, alpha=1/2)

# One-way ANOVA for tip based on day
tipAnova <- aov(tip ~ day, tips)
summary(tipAnova)
##              Df Sum Sq Mean Sq F value Pr(>F)
## day           3    9.5   3.175   1.672  0.174
## Residuals   240  455.7   1.899
# ANOVA for tea types
green_tea <- c(0, 2, 3, 5, 8, 10, 12)
no_tea <- c(1, 2, 3, 9, 10, 10, 11)
peppermint_tea <- c(1, 4, 2, 5, 8, 9, 12)

tea_weight <- c(green_tea, no_tea, peppermint_tea)
tea_names <- c(rep("green_tea", 7), rep("no_tea", 7), rep("peppermint_tea",7))
weight_df <- data.frame(tea_names, tea_weight)

weightAnova <- aov(tea_weight ~ tea_names, weight_df)
summary(weightAnova)
##             Df Sum Sq Mean Sq F value Pr(>F)
## tea_names    2      3   1.476   0.082  0.922
## Residuals   18    326  18.111