Q1: Load packages

# 1. Install required packages if you haven't installed them yet


# 2. Load the required packages with the library() function. You can also use the pacman package to load multiple packages at a time with pacman::p_load().
library("summarytools")
library("psych")
library("ggplot2")
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

Q2: Set working directory and read in data

# file.choose() lets you pick a file interactively and prints the path you selected. The result is not stored, so this call only helps you look up a path to copy.
file.choose()
## [1] "C:\\Users\\dcg5438\\Desktop\\BBH 505\\Gilliland_Lab1_597.Rmd"
# 3. Read in data using read.csv(). The .csv file is called data_R1.csv. We use the assignment operator <- to denote we want a variable name to be represented by what is contained in the .csv file.
# NOTE: this is an absolute path on one specific machine, so it must be edited before running elsewhere. Each backslash is doubled because "\\" is how a single backslash is written inside an R string.
lab1_dat <- read.csv("C:\\Users\\dcg5438\\Desktop\\data_R1.csv")

Q3: Ask for variable names and data structure

# 1. Use colnames() with the variable name encased in the parentheses we defined above to get the variable names.

# Prints a character vector holding the name of every column in lab1_dat.
colnames(lab1_dat)
## [1] "age"    "height" "weight"
# 2. Use the str() function with the variable name encased in the parentheses we defined above to get the data structure (i.e., number of variables and observations).
# The output below shows the object class (data.frame), its size (10 observations of 3 variables), and the type and first values of each column (all integer here).

str(lab1_dat)
## 'data.frame':    10 obs. of  3 variables:
##  $ age   : int  8 8 9 7 8 7 8 8 9 8
##  $ height: int  45 47 49 44 45 43 44 45 47 52
##  $ weight: int  57 58 62 51 60 51 55 58 64 65

Q4: Print the first 6 rows and last 6 rows of the data, and all of the data

# 1. The head(varname, n = 6) function is a good way to view the first six rows of the data.

# n sets how many rows are returned; this shows the first six of the ten rows.
head(lab1_dat, n = 6)
age height weight
8 45 57
8 47 58
9 49 62
7 44 51
8 45 60
7 43 51
# 2. The tail() function following the same format is a good way to view the last six rows of the data.

# The leading number printed on each row (5 through 10) is that row's position in the full data frame.
tail(lab1_dat, n = 6)
age height weight
5 8 45 60
6 7 43 51
7 8 44 55
8 8 45 58
9 9 47 64
10 8 52 65
# 3. To view the entire dataset, you simply enter the dataset name we defined above.

# Entering an object's name on its own prints the whole object; all ten rows are shown here.
lab1_dat
age height weight
8 45 57
8 47 58
9 49 62
7 44 51
8 45 60
7 43 51
8 44 55
8 45 58
9 47 64
8 52 65

Q5: Ask for frequencies for age and height

# 1. To ask for frequencies, use the freq() function with the dataset name. To ask for a specific variable, we use a $ sign. For example, dataframe$variablename.

# summarytools was already loaded in Q1, so this library() call is redundant but harmless.
library("summarytools")
# freq() lists each distinct value with its count and (cumulative) percentages; the <NA> row reports missing values (none here).
freq(lab1_dat$age)
## Frequencies  
## lab1_dat$age  
## Type: Integer  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           7      2     20.00          20.00     20.00          20.00
##           8      6     60.00          80.00     60.00          80.00
##           9      2     20.00         100.00     20.00         100.00
##        <NA>      0                               0.00         100.00
##       Total     10    100.00         100.00    100.00         100.00
# Frequency table for height: six distinct values, with 45 the most common (3 of 10 rows).
freq(lab1_dat$height)
## Frequencies  
## lab1_dat$height  
## Type: Integer  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##          43      1     10.00          10.00     10.00          10.00
##          44      2     20.00          30.00     20.00          30.00
##          45      3     30.00          60.00     30.00          60.00
##          47      2     20.00          80.00     20.00          80.00
##          49      1     10.00          90.00     10.00          90.00
##          52      1     10.00         100.00     10.00         100.00
##        <NA>      0                               0.00         100.00
##       Total     10    100.00         100.00    100.00         100.00
# Q5 only asks for age and height; weight is tabulated the same way for completeness.
freq(lab1_dat$weight)
## Frequencies  
## lab1_dat$weight  
## Type: Integer  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##          51      2     20.00          20.00     20.00          20.00
##          55      1     10.00          30.00     10.00          30.00
##          57      1     10.00          40.00     10.00          40.00
##          58      2     20.00          60.00     20.00          60.00
##          60      1     10.00          70.00     10.00          70.00
##          62      1     10.00          80.00     10.00          80.00
##          64      1     10.00          90.00     10.00          90.00
##          65      1     10.00         100.00     10.00         100.00
##        <NA>      0                               0.00         100.00
##       Total     10    100.00         100.00    100.00         100.00

Q6: Recode height into groups

# 1. To recode into groups, we use the ifelse() function. It is organized as ifelse(condition for variable, value if true, value if false). For example, if I wanted to organize age into above 18 versus 18 and below, we would do ifelse(dataframe$age > 18, 1, 0), where 1 == above 18 and 0 == 18 or below.

# Code each row as 1 (height under 47 inches) or 2 (47 inches or taller).
lab1_dat$height_grp <- ifelse(lab1_dat$height >= 47, 2, 1)
class(lab1_dat$height_grp)
## [1] "numeric"
# Convert the numeric codes into a labelled factor: 1 = "Short", 2 = "Tall".
lab1_dat$height_grp <- factor(
  lab1_dat$height_grp,
  levels = 1:2,
  labels = c("Short", "Tall")
)

# Cross-tabulate the new groups against raw height to confirm the recode.
ctable(lab1_dat$height_grp, lab1_dat$height)
## Cross-Tabulation, Row Proportions  
## height_grp * height  
## Data Frame: lab1_dat  
## 
## ------------ -------- ----------- ----------- ----------- ----------- ----------- ----------- -------------
##                height          43          44          45          47          49          52         Total
##   height_grp                                                                                               
##        Short            1 (16.7%)   2 (33.3%)   3 (50.0%)   0 ( 0.0%)   0 ( 0.0%)   0 ( 0.0%)    6 (100.0%)
##         Tall            0 ( 0.0%)   0 ( 0.0%)   0 ( 0.0%)   2 (50.0%)   1 (25.0%)   1 (25.0%)    4 (100.0%)
##        Total            1 (10.0%)   2 (20.0%)   3 (30.0%)   2 (20.0%)   1 (10.0%)   1 (10.0%)   10 (100.0%)
## ------------ -------- ----------- ----------- ----------- ----------- ----------- ----------- -------------

Q7: Plots

# ggplot2 is the preferred package for making plots because its plots are highly customizable. For now, I give you the script to make these plots. A plot is built up in layers joined with +, organized as follows: ggplot(dataframe, aes(x = xvariable, y = yvariable [if applicable])) starts the plot; geom_{bar, histogram, line}() sets the type of plot you want to make (e.g., a bar plot would be geom_bar(fill = "color", color = "line color", width = some number, typically 0.5-2.5)); for histograms, width is replaced by binwidth because we are binning our data. labs() specifies the title and axis names, which we give as strings like "X Axis", and theme_bw() is an easy way to make the plots look neat and clean.

# 1. Bar plot
library("ggplot2")
# One bar per height_grp level; bar height is the number of rows in that level.
ggplot(data = lab1_dat, mapping = aes(x = height_grp)) +
  geom_bar(width = 0.5, colour = "black", fill = "gray") +
  ggtitle("Bar Plot") +
  xlab("Height group") +
  ylab("Count") +
  theme_bw()

# 2. Histogram
# Bin the raw heights into 2.5-unit-wide intervals and count the rows per bin.
ggplot(data = lab1_dat, mapping = aes(x = height)) +
  geom_histogram(binwidth = 2.5, colour = "black", fill = "gray") +
  ggtitle("Histogram") +
  xlab("Height (inch)") +
  ylab("Count") +
  theme_bw()

# 3. Histogram with a density curve
# Histogram drawn on the density scale, overlaid with a normal curve whose
# mean and SD are estimated from the observed heights.
ggplot(lab1_dat, aes(x = height)) +
  geom_histogram(
    aes(y = after_stat(density)),
    fill = "gray", color = "black", binwidth = 2.5
  ) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = mean(lab1_dat$height, na.rm = TRUE),
      sd = sd(lab1_dat$height, na.rm = TRUE)
    ),
    # Evaluate the curve over the full displayed range.
    xlim = c(40, 55),
    color = "blue", linewidth = 1
  ) +
  labs(title = "Histogram with Density Curve", x = "Height (inch)", y = "Density") +
  # Zoom with coord_cartesian() rather than scale_x_continuous(limits = ...):
  # scale limits drop histogram bars that extend past them and trigger a
  # "Removed rows ... outside the scale range" warning.
  coord_cartesian(xlim = c(40, 55)) +
  theme_bw()

# 4. Line plot
# Line through the number of rows observed at each distinct height value.
# The x-axis label uses inches, matching the histograms; height is not
# measured in pounds.
ggplot(lab1_dat, aes(x = height)) +
  geom_line(stat = "count") +
  labs(title = "Line Plot", x = "Height (inch)", y = "Count") +
  scale_x_continuous(limits = c(40, 55)) +
  theme_bw()