This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
You can also embed plots, for example:
# Economics dataset from GGally package
head(economics)
## # A tibble: 6 × 6
## date pce pop psavert uempmed unemploy
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1967-07-01 507. 198712 12.6 4.5 2944
## 2 1967-08-01 510. 198911 12.6 4.7 2945
## 3 1967-09-01 516. 199113 11.9 4.6 2958
## 4 1967-10-01 512. 199311 12.9 4.9 3143
## 5 1967-11-01 517. 199498 12.8 4.7 3066
## 6 1967-12-01 525. 199657 11.8 4.8 3018
# Calculate correlation between pce and psavert
cor(economics$pce, economics$psavert)
## [1] -0.7928546
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
# Calculate each part of the correlation manually
xPart <- economics$pce - mean(economics$pce)
yPart <- economics$psavert - mean(economics$psavert)
nMinusOne <- (nrow(economics) - 1)
xSD <- sd(economics$pce)
ySD <- sd(economics$psavert)
# Use the correlation formula
sum(xPart * yPart) / (nMinusOne * xSD * ySD)
## [1] -0.7928546
# Calculate each part of the correlation manually
xPart <- economics$pce - mean(economics$pce)
yPart <- economics$psavert - mean(economics$psavert)
nMinusOne <- (nrow(economics) - 1)
xSD <- sd(economics$pce)
ySD <- sd(economics$psavert)
# Use the correlation formula
sum(xPart * yPart) / (nMinusOne * xSD * ySD)
# Calculate the correlation matrix for variables 2, 4, 5, 6 in the economics dataset
cormat <- round(cor(economics[, c(2, 4:6)]), 2)
head(cormat)
## pce psavert uempmed unemploy
## pce 1.00 -0.79 0.73 0.61
## psavert -0.79 1.00 -0.33 -0.31
## uempmed 0.73 -0.33 1.00 0.87
## unemploy 0.61 -0.31 0.87 1.00
# Load the reshape2 package
library(reshape2)
# Reshape the correlation matrix into long format for plotting
melted_cormat <- reshape2::melt(cormat)
head(melted_cormat)
## Var1 Var2 value
## 1 pce pce 1.00
## 2 psavert pce -0.79
## 3 uempmed pce 0.73
## 4 unemploy pce 0.61
## 5 pce psavert -0.79
## 6 psavert psavert 1.00
# Load the ggplot2 package
library(ggplot2)
# Plot the correlation matrix as a heatmap
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) +
geom_tile()
# Dealing with missing data
m <- c(9, 9, NA, 3, NA, 5, 8, 1, 10, 4)
n <- c(2, NA, 1, 6, 6, 4, 1, 1, 6, 7)
p <- c(8, 4, 3, 9, 10, NA, 3, NA, 9, 9)
q <- c(10, 10, 7, 8, 4, 2, 8, 5, 5, 2)
r <- c(1, 9, 7, 6, 5, 6, 2, 7, 9, 10)
# Combine them together
theMat <- cbind(m, n, p, q, r)
theMat
## m n p q r
## [1,] 9 2 8 10 1
## [2,] 9 NA 4 10 9
## [3,] NA 1 3 7 7
## [4,] 3 6 9 8 6
## [5,] NA 6 10 4 5
## [6,] 5 4 NA 2 6
## [7,] 8 1 3 8 2
## [8,] 1 1 NA 5 7
## [9,] 10 6 9 5 9
## [10,] 4 7 9 2 10
# Compute correlations with different methods of handling NA values
cor(theMat, use="everything")
## m n p q r
## m 1 NA NA NA NA
## n NA 1 NA NA NA
## p NA NA 1 NA NA
## q NA NA NA 1.0000000 -0.4242958
## r NA NA NA -0.4242958 1.0000000
cor(theMat, use="complete.obs")
## m n p q r
## m 1.0000000 -0.5228840 -0.2893527 0.2974398 -0.3459470
## n -0.5228840 1.0000000 0.8090195 -0.7448453 0.9350718
## p -0.2893527 0.8090195 1.0000000 -0.3613720 0.6221470
## q 0.2974398 -0.7448453 -0.3613720 1.0000000 -0.9059384
## r -0.3459470 0.9350718 0.6221470 -0.9059384 1.0000000
cor(theMat, use="na.or.complete")
## m n p q r
## m 1.0000000 -0.5228840 -0.2893527 0.2974398 -0.3459470
## n -0.5228840 1.0000000 0.8090195 -0.7448453 0.9350718
## p -0.2893527 0.8090195 1.0000000 -0.3613720 0.6221470
## q 0.2974398 -0.7448453 -0.3613720 1.0000000 -0.9059384
## r -0.3459470 0.9350718 0.6221470 -0.9059384 1.0000000
# Calculate the correlation just on complete rows
cor(theMat[c(1, 4, 7, 9, 10), ])
## m n p q r
## m 1.0000000 -0.5228840 -0.2893527 0.2974398 -0.3459470
## n -0.5228840 1.0000000 0.8090195 -0.7448453 0.9350718
## p -0.2893527 0.8090195 1.0000000 -0.3613720 0.6221470
## q 0.2974398 -0.7448453 -0.3613720 1.0000000 -0.9059384
## r -0.3459470 0.9350718 0.6221470 -0.9059384 1.0000000
# Compare "complete.obs" and computing on select rows
identical(cor(theMat, use="complete.obs"),
cor(theMat[c(1, 4, 7, 9, 10), ]))
## [1] TRUE
# Load and explore the 'tips' dataset
data(tips, package="reshape2")
head(tips)
## total_bill tip sex smoker day time size
## 1 16.99 1.01 Female No Sun Dinner 2
## 2 10.34 1.66 Male No Sun Dinner 3
## 3 21.01 3.50 Male No Sun Dinner 3
## 4 23.68 3.31 Male No Sun Dinner 2
## 5 24.59 3.61 Female No Sun Dinner 4
## 6 25.29 4.71 Male No Sun Dinner 4
str(tips)
## 'data.frame': 244 obs. of 7 variables:
## $ total_bill: num 17 10.3 21 23.7 24.6 ...
## $ tip : num 1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 2 2 2 ...
## $ smoker : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ day : Factor w/ 4 levels "Fri","Sat","Sun",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ time : Factor w/ 2 levels "Dinner","Lunch": 1 1 1 1 1 1 1 1 1 1 ...
## $ size : int 2 3 3 2 4 4 2 4 2 2 ...
unique(tips$sex)
## [1] Female Male
## Levels: Female Male
unique(tips$day)
## [1] Sun Sat Thur Fri
## Levels: Fri Sat Sun Thur
# T-test to check if mean tip differs from 2.50
t.test(tips$tip, alternative="two.sided", mu=2.50)
##
## One Sample t-test
##
## data: tips$tip
## t = 5.6253, df = 243, p-value = 5.08e-08
## alternative hypothesis: true mean is not equal to 2.5
## 95 percent confidence interval:
## 2.823799 3.172758
## sample estimates:
## mean of x
## 2.998279
# Plotting the t-distribution and test statistic
randT <- rt(30000, df=NROW(tips)-1)
tipTTest <- t.test(tips$tip, alternative="two.sided", mu=2.50)
ggplot(data.frame(x=randT)) +
geom_density(aes(x=x), fill="grey", color="grey") +
geom_vline(xintercept=tipTTest$statistic) +
geom_vline(xintercept=mean(randT) + c(-2, 2)*sd(randT), linetype=2)
# Variance of tips by sex
aggregate(tip ~ sex, data=tips, var)
## sex tip
## 1 Female 1.344428
## 2 Male 2.217424
# Shapiro-Wilk normality test
shapiro.test(tips$tip)
##
## Shapiro-Wilk normality test
##
## data: tips$tip
## W = 0.89781, p-value = 8.2e-12
shapiro.test(tips$tip[tips$sex == "Female"])
##
## Shapiro-Wilk normality test
##
## data: tips$tip[tips$sex == "Female"]
## W = 0.95678, p-value = 0.005448
shapiro.test(tips$tip[tips$sex == "Male"])
##
## Shapiro-Wilk normality test
##
## data: tips$tip[tips$sex == "Male"]
## W = 0.87587, p-value = 3.708e-10
# Visual inspection of distribution
ggplot(tips, aes(x=tip, fill=sex)) +
geom_histogram(binwidth=.5, alpha=1/2)
# One-way ANOVA for tip based on day
tipAnova <- aov(tip ~ day, tips)
summary(tipAnova)
## Df Sum Sq Mean Sq F value Pr(>F)
## day 3 9.5 3.175 1.672 0.174
## Residuals 240 455.7 1.899
# ANOVA for tea types
green_tea <- c(0, 2, 3, 5, 8, 10, 12)
no_tea <- c(1, 2, 3, 9, 10, 10, 11)
peppermint_tea <- c(1, 4, 2, 5, 8, 9, 12)
tea_weight <- c(green_tea, no_tea, peppermint_tea)
tea_names <- c(rep("green_tea", 7), rep("no_tea", 7), rep("peppermint_tea",7))
weight_df <- data.frame(tea_names, tea_weight)
weightAnova <- aov(tea_weight ~ tea_names, weight_df)
summary(weightAnova)
## Df Sum Sq Mean Sq F value Pr(>F)
## tea_names 2 3 1.476 0.082 0.922
## Residuals 18 326 18.111