head(bwt)
## id birthweight length gestation smoker
## 1 1313 5.8 17 33 0
## 2 431 4.2 19 33 1
## 3 808 6.4 19 34 0
## 4 300 4.5 18 35 1
## 5 516 5.8 18 35 1
## 6 321 6.8 19 37 0
str(bwt)
## 'data.frame': 42 obs. of 5 variables:
## $ id : int 1313 431 808 300 516 321 1363 575 822 1081 ...
## $ birthweight: num 5.8 4.2 6.4 4.5 5.8 6.8 5.2 6.1 7.5 8 ...
## $ length : int 17 19 19 18 18 19 19 19 19 21 ...
## $ gestation : int 33 33 34 35 35 37 37 37 38 38 ...
## $ smoker : int 0 1 0 1 1 0 1 1 0 0 ...
## View the values
freq(bwt$birthweight)
## Frequencies
## bwt$birthweight
## Type: Numeric
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 4.2 1 2.38 2.38 2.38 2.38
## 4.5 1 2.38 4.76 2.38 4.76
## 5.2 1 2.38 7.14 2.38 7.14
## 5.5 1 2.38 9.52 2.38 9.52
## 5.8 2 4.76 14.29 4.76 14.29
## 6 2 4.76 19.05 4.76 19.05
## 6.1 1 2.38 21.43 2.38 21.43
## 6.3 1 2.38 23.81 2.38 23.81
## 6.4 1 2.38 26.19 2.38 26.19
## 6.6 2 4.76 30.95 4.76 30.95
## 6.8 1 2.38 33.33 2.38 33.33
## 6.9 2 4.76 38.10 4.76 38.10
## 7 3 7.14 45.24 7.14 45.24
## 7.1 1 2.38 47.62 2.38 47.62
## 7.2 1 2.38 50.00 2.38 50.00
## 7.3 3 7.14 57.14 7.14 57.14
## 7.5 2 4.76 61.90 4.76 61.90
## 7.7 1 2.38 64.29 2.38 64.29
## 7.8 1 2.38 66.67 2.38 66.67
## 7.9 1 2.38 69.05 2.38 69.05
## 8 3 7.14 76.19 7.14 76.19
## 8.3 1 2.38 78.57 2.38 78.57
## 8.5 2 4.76 83.33 4.76 83.33
## 8.6 2 4.76 88.10 4.76 88.10
## 8.9 1 2.38 90.48 2.38 90.48
## 9 1 2.38 92.86 2.38 92.86
## 9.5 1 2.38 95.24 2.38 95.24
## 10 2 4.76 100.00 4.76 100.00
## <NA> 0 0.00 100.00
## Total 42 100.00 100.00 100.00 100.00
# make a simple histogram of birthweight
hist(bwt$birthweight, main = "Histogram of birthweight", xlab = "Birthweight (in pounds)")
# simple histogram of gestation
hist(bwt$gestation, main = "Histogram of gestation", xlab = "Gestation (in weeks)")
# an alternative way of making histogram using the ggplot2 package
ggplot(data = bwt, aes(x = birthweight)) +
geom_histogram(bins = 6, breaks = c(4,5,6,7,8,9,10),
fill = "grey", color = "black") +
labs(x = "Birthweight (in pounds)", y = "Frequency",
title = "Histogram of birthweight") +
theme_bw()
# get the mean, variance, and sd of birthweight and gestation
mean(bwt$birthweight)
## [1] 7.264286
var(bwt$birthweight)
## [1] 1.768206
sd(bwt$birthweight)
## [1] 1.329739
mean(bwt$gestation)
## [1] 39.19048
var(bwt$gestation)
## [1] 6.987224
sd(bwt$gestation)
## [1] 2.643336
# first, we can check to see if the original birthweight data had NAs:
anyNA((bwt[,"birthweight"]))
## [1] FALSE
# make new data frame w/ NAs
bwt_copy <- bwt # copy data file (with new name)
bwt_copy[1,"birthweight"] <- NA # sets the first observation in column birthweight to NA
is.na(bwt_copy[1,"birthweight"])
## [1] TRUE
anyNA((bwt_copy[,"birthweight"]))
## [1] TRUE
# functions will not work because of NA
mean(bwt_copy$birthweight)
## [1] NA
var(bwt_copy$birthweight)
## [1] NA
sd(bwt_copy$birthweight)
## [1] NA
#Option 1, modify function specifications
mean(bwt_copy$birthweight, na.rm=TRUE)
## [1] 7.3
#Option 2: use a different package/function that automatically ignores NA. One may also choose to write their own function using the function(x){} command to define this command. See chunk below as an example.
mean_(bwt_copy$birthweight)
## [1] 7.3
var_(bwt_copy$birthweight)
## [1] 1.7575
sd_(bwt_copy$birthweight)
## [1] 1.325707
Create ‘custom’ function called n_() for sum of non-missing observations to get count:
n_ <- function(x){
sum_(!is.na(x))
}
We tell R what n_ will do by writing code inside the function(x){} brackets
Now let’s try it out
n_(x = bwt_copy$birthweight) #with NA replacing one of the observations
## [1] 41
n_(x = bwt$birthweight) #without NA replacing one of the observations
## [1] 42
Alternate using psych::describe to give descriptives or just the number of valid observations:
# Of note, the :: operator basically means "use this function specifically from this package." This is because some R packages have duplicated function names and R may become confused or use the wrong one depending on which packages you've loaded in.
describe(bwt$birthweight)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 42 7.26 1.33 7.25 7.27 1.19 4.2 10 5.8 -0.05 -0.27 0.21
describe(bwt$birthweight)$n # can put other names after $ instead (median, etc.). You are basically asking for a specific column spit out by the describe command.
## [1] 42
Now let’s get descriptives for babies’ lengths at birth (their heights in inches)
# Plot length
hist(bwt$length, main = "Histogram of length", xlab = "Length (in inches)")
# Get single descriptives
mean_(bwt$length)
## [1] 19.92857
var_(bwt$length)
## [1] 1.238676
sd_(bwt$length)
## [1] 1.112958
n_(bwt$length)
## [1] 42
describe(bwt[,c("birthweight","length")]) #again using the c() vector to call multiple variables/cols
## vars n mean sd median trimmed mad min max range skew
## birthweight 1 42 7.26 1.33 7.25 7.27 1.19 4.2 10 5.8 -0.05
## length 2 42 19.93 1.11 20.00 19.94 1.48 17.0 22 5.0 -0.17
## kurtosis se
## birthweight -0.27 0.21
## length -0.23 0.17
I could also loop through the multiple variables using each of the individual hablar function (mean_, var_, sd_, n_) using sapply()
# By using the , we basically say we want to apply this to all rows among the specified variables encased in the c().
sapply(bwt[,c("birthweight","length")], mean_) # means
## birthweight length
## 7.264286 19.928571
sapply(bwt[,c("birthweight","length")], var_) # variances
## birthweight length
## 1.768206 1.238676
sapply(bwt[,c("birthweight","length")], sd_) # sds
## birthweight length
## 1.329739 1.112958
sapply(bwt[,c("birthweight","length")], n_) # ns
## birthweight length
## 42 42
# Scatter Plot with base r
plot(y = bwt$birthweight,
x = bwt$gestation,
xlab = "Gestation (in weeks)",
ylab = "Birthweight (in pounds)")
# fit a regression line
abline(reg = lm(birthweight ~ gestation, data = bwt, na.action = na.exclude),
h = mean_(bwt$birthweight),
v = mean_(bwt$gestation),
col = c("red","black","black"))
# Scatterplot with ggplot
ggplot(data = bwt, aes(x = gestation, y = birthweight)) +
geom_point(shape = 1) +
geom_smooth(method = "lm", formula = "y ~ x", se = F) +
geom_hline(yintercept = mean_(bwt$birthweight), linetype = "dashed", color = "red") +
geom_vline(xintercept = mean_(bwt$gestation), linetype = "dashed", color = "red") +
theme_bw()
This line suggests a positive covariance and correlation.
When length is above its mean, so is birthweight. When length is below its mean, so is birthweight.
We can calculate the covariance using the cov() function… #calculating covariance
cov(x = bwt$gestation, y = bwt$birthweight, use = "pairwise.complete.obs")
## [1] 2.482578
## covariance matrix (with variances on the diagonals)
cov(bwt[,c("birthweight","gestation")], use = "pairwise.complete.obs")
## birthweight gestation
## birthweight 1.768206 2.482578
## gestation 2.482578 6.987224
#calculating correlation
cor(x = bwt$gestation, y = bwt$birthweight, use = "pairwise.complete.obs")
## [1] 0.7062919
# correlation matrix. A more elaborate alternative you can use if you want to pull out other statistics from the correlation(s) is the cor.test() function.
cor(bwt[,c("birthweight","gestation")], use = "pairwise.complete.obs")
## birthweight gestation
## birthweight 1.0000000 0.7062919
## gestation 0.7062919 1.0000000
#Calculating r2
(r2 <- cor(x = bwt$gestation, y = bwt$birthweight, use = "pairwise.complete.obs")^2) #the ^2 specifies you want to square it.
## [1] 0.4988483
# as a percentage
r2*100
## [1] 49.88483
These variables share 48.6% of their variance with each other.
Finally, we will calculate some z-scores using the scale() function. Then we will see something interesting.
I will use dplyr piping %>% to make this easier
# The scale function is a way to z-transform variables as long as their class == numeric.
bwt <- bwt %>%
mutate(zbirthweight = scale(birthweight)[,1],
zgestation = scale(gestation)[,1])
If you take the covariance of z-scored variables, it is a correlation (more on that later…). Maybe you can see why if you ask for the full matrix (note the variances of z-scored variables…)
# Covariance
cov(bwt[,c("zbirthweight","zgestation")], use = "pairwise.complete.obs")
## zbirthweight zgestation
## zbirthweight 1.0000000 0.7062919
## zgestation 0.7062919 1.0000000
# Correlation
cor(bwt[,c("zbirthweight","zgestation")], use = "pairwise.complete.obs")
## zbirthweight zgestation
## zbirthweight 1.0000000 0.7062919
## zgestation 0.7062919 1.0000000