View the data

head(bwt)
##     id birthweight length gestation smoker
## 1 1313         5.8     17        33      0
## 2  431         4.2     19        33      1
## 3  808         6.4     19        34      0
## 4  300         4.5     18        35      1
## 5  516         5.8     18        35      1
## 6  321         6.8     19        37      0
str(bwt)
## 'data.frame':    42 obs. of  5 variables:
##  $ id         : int  1313 431 808 300 516 321 1363 575 822 1081 ...
##  $ birthweight: num  5.8 4.2 6.4 4.5 5.8 6.8 5.2 6.1 7.5 8 ...
##  $ length     : int  17 19 19 18 18 19 19 19 19 21 ...
##  $ gestation  : int  33 33 34 35 35 37 37 37 38 38 ...
##  $ smoker     : int  0 1 0 1 1 0 1 1 0 0 ...

Descriptives for Birthweight

## View the values
freq(bwt$birthweight)
## Frequencies  
## bwt$birthweight  
## Type: Numeric  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##         4.2      1      2.38           2.38      2.38           2.38
##         4.5      1      2.38           4.76      2.38           4.76
##         5.2      1      2.38           7.14      2.38           7.14
##         5.5      1      2.38           9.52      2.38           9.52
##         5.8      2      4.76          14.29      4.76          14.29
##           6      2      4.76          19.05      4.76          19.05
##         6.1      1      2.38          21.43      2.38          21.43
##         6.3      1      2.38          23.81      2.38          23.81
##         6.4      1      2.38          26.19      2.38          26.19
##         6.6      2      4.76          30.95      4.76          30.95
##         6.8      1      2.38          33.33      2.38          33.33
##         6.9      2      4.76          38.10      4.76          38.10
##           7      3      7.14          45.24      7.14          45.24
##         7.1      1      2.38          47.62      2.38          47.62
##         7.2      1      2.38          50.00      2.38          50.00
##         7.3      3      7.14          57.14      7.14          57.14
##         7.5      2      4.76          61.90      4.76          61.90
##         7.7      1      2.38          64.29      2.38          64.29
##         7.8      1      2.38          66.67      2.38          66.67
##         7.9      1      2.38          69.05      2.38          69.05
##           8      3      7.14          76.19      7.14          76.19
##         8.3      1      2.38          78.57      2.38          78.57
##         8.5      2      4.76          83.33      4.76          83.33
##         8.6      2      4.76          88.10      4.76          88.10
##         8.9      1      2.38          90.48      2.38          90.48
##           9      1      2.38          92.86      2.38          92.86
##         9.5      1      2.38          95.24      2.38          95.24
##          10      2      4.76         100.00      4.76         100.00
##        <NA>      0                               0.00         100.00
##       Total     42    100.00         100.00    100.00         100.00
# make a simple histogram of birthweight 
hist(bwt$birthweight, main = "Histogram of birthweight", xlab = "Birthweight (in pounds)")

# simple histogram of gestation
hist(bwt$gestation, main = "Histogram of gestation", xlab = "Gestation (in weeks)")

# an alternative way of making histogram using the ggplot2 package
ggplot(data = bwt, aes(x = birthweight)) +
  geom_histogram(bins = 6, breaks = c(4,5,6,7,8,9,10),
                 fill = "grey", color = "black") +
  labs(x = "Birthweight (in pounds)", y = "Frequency",
       title = "Histogram of birthweight") +
  theme_bw()

# get the mean, variance, and sd of birthweight and gestation
mean(bwt$birthweight)
## [1] 7.264286
var(bwt$birthweight)
## [1] 1.768206
sd(bwt$birthweight)
## [1] 1.329739
mean(bwt$gestation)
## [1] 39.19048
var(bwt$gestation)
## [1] 6.987224
sd(bwt$gestation)
## [1] 2.643336

Example Dealing with Missing (NA) Data

# first, we can check to see if the original birthweight data had NAs:
anyNA((bwt[,"birthweight"]))
## [1] FALSE
# make new data frame w/ NAs
bwt_copy <- bwt # copy data file (with new name) 
bwt_copy[1,"birthweight"] <- NA # sets the first observation in column birthweight to NA

is.na(bwt_copy[1,"birthweight"])
## [1] TRUE
anyNA((bwt_copy[,"birthweight"]))
## [1] TRUE
# functions will not work because of NA
mean(bwt_copy$birthweight)
## [1] NA
var(bwt_copy$birthweight)
## [1] NA
sd(bwt_copy$birthweight)
## [1] NA
#Option 1, modify function specifications
mean(bwt_copy$birthweight, na.rm=TRUE)
## [1] 7.3
#Option 2: use a different package/function that automatically ignores NA. One may also choose to write their own function using the function(x){} command to define this command. See chunk below as an example.
mean_(bwt_copy$birthweight)
## [1] 7.3
var_(bwt_copy$birthweight)
## [1] 1.7575
sd_(bwt_copy$birthweight)
## [1] 1.325707

Calculating observation number for any given variable

Create ‘custom’ function called n_() for sum of non-missing observations to get count:

n_ <- function(x){
  sum_(!is.na(x))
}

We tell R what n_ will do by writing code inside the function(x){} brackets

Now let’s try it out

n_(x = bwt_copy$birthweight) #with NA replacing one of the observations
## [1] 41
n_(x = bwt$birthweight) #without NA replacing one of the observations
## [1] 42

Alternate using psych::describe to give descriptives or just the number of valid observations:

# Of note, the :: operator basically means "use this function specifically from this package." This is because some R packages have duplicated function names and R may become confused or use the wrong one depending on which packages you've loaded in.

describe(bwt$birthweight)
##    vars  n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 42 7.26 1.33   7.25    7.27 1.19 4.2  10   5.8 -0.05    -0.27 0.21
describe(bwt$birthweight)$n # can put other names after $ instead (median, etc.). You are basically asking for a specific column spit out by the describe command.
## [1] 42

Descriptives for Length

Now let’s get descriptives for babies’ lengths at birth (their heights in inches)

# Plot length 
hist(bwt$length, main = "Histogram of length", xlab = "Length (in inches)")

# Get single descriptives 
mean_(bwt$length)
## [1] 19.92857
var_(bwt$length)
## [1] 1.238676
sd_(bwt$length)
## [1] 1.112958
n_(bwt$length)
## [1] 42

Descriptives for more than one variable at a time using indexing (reminder: format is [row, column])

describe(bwt[,c("birthweight","length")]) #again using the c() vector to call multiple variables/cols
##             vars  n  mean   sd median trimmed  mad  min max range  skew
## birthweight    1 42  7.26 1.33   7.25    7.27 1.19  4.2  10   5.8 -0.05
## length         2 42 19.93 1.11  20.00   19.94 1.48 17.0  22   5.0 -0.17
##             kurtosis   se
## birthweight    -0.27 0.21
## length         -0.23 0.17

I could also loop through the multiple variables using each of the individual hablar function (mean_, var_, sd_, n_) using sapply()

# By using the , we basically say we want to apply this to all rows among the specified variables encased in the c().
sapply(bwt[,c("birthweight","length")], mean_) # means
## birthweight      length 
##    7.264286   19.928571
sapply(bwt[,c("birthweight","length")], var_) # variances
## birthweight      length 
##    1.768206    1.238676
sapply(bwt[,c("birthweight","length")], sd_) # sds
## birthweight      length 
##    1.329739    1.112958
sapply(bwt[,c("birthweight","length")], n_) # ns
## birthweight      length 
##          42          42
# Scatter Plot with base r 
plot(y = bwt$birthweight, 
     x = bwt$gestation,
     xlab = "Gestation (in weeks)",
     ylab = "Birthweight (in pounds)")
# fit a regression line
abline(reg = lm(birthweight ~ gestation, data = bwt, na.action = na.exclude), 
       h = mean_(bwt$birthweight), 
       v = mean_(bwt$gestation),
       col = c("red","black","black"))

# Scatterplot with ggplot 
ggplot(data = bwt, aes(x = gestation, y = birthweight)) +
  geom_point(shape = 1) +
  geom_smooth(method = "lm", formula = "y ~ x", se = F) +
  geom_hline(yintercept = mean_(bwt$birthweight), linetype = "dashed", color = "red") +
  geom_vline(xintercept = mean_(bwt$gestation), linetype = "dashed", color = "red") +
  theme_bw()

This line suggests a positive covariance and correlation.

When length is above its mean, so is birthweight. When length is below its mean, so is birthweight.

We can calculate the covariance using the cov() function… #calculating covariance

cov(x = bwt$gestation, y = bwt$birthweight, use = "pairwise.complete.obs")
## [1] 2.482578
## covariance matrix (with variances on the diagonals)
cov(bwt[,c("birthweight","gestation")], use = "pairwise.complete.obs")
##             birthweight gestation
## birthweight    1.768206  2.482578
## gestation      2.482578  6.987224

#calculating correlation

cor(x = bwt$gestation, y = bwt$birthweight, use = "pairwise.complete.obs")
## [1] 0.7062919
# correlation matrix. A more elaborate alternative you can use if you want to pull out other statistics from the correlation(s) is the cor.test() function.
cor(bwt[,c("birthweight","gestation")], use = "pairwise.complete.obs")
##             birthweight gestation
## birthweight   1.0000000 0.7062919
## gestation     0.7062919 1.0000000

#Calculating r2

(r2 <- cor(x = bwt$gestation, y = bwt$birthweight, use = "pairwise.complete.obs")^2) #the ^2 specifies you want to square it.
## [1] 0.4988483
# as a percentage
r2*100
## [1] 49.88483

These variables share 48.6% of their variance with each other.

Finally, we will calculate some z-scores using the scale() function. Then we will see something interesting.

I will use dplyr piping %>% to make this easier

# The scale function is a way to z-transform variables as long as their class == numeric.
bwt <- bwt %>% 
    mutate(zbirthweight = scale(birthweight)[,1],
           zgestation = scale(gestation)[,1])

If you take the covariance of z-scored variables, it is a correlation (more on that later…). Maybe you can see why if you ask for the full matrix (note the variances of z-scored variables…)

# Covariance
cov(bwt[,c("zbirthweight","zgestation")], use = "pairwise.complete.obs")
##              zbirthweight zgestation
## zbirthweight    1.0000000  0.7062919
## zgestation      0.7062919  1.0000000
# Correlation
cor(bwt[,c("zbirthweight","zgestation")], use = "pairwise.complete.obs")
##              zbirthweight zgestation
## zbirthweight    1.0000000  0.7062919
## zgestation      0.7062919  1.0000000