Q1: Load packages
# 1. Install required packages if you haven't installed them yet
# 2. Load the required packages with the library() function. You can also use the pacman package to load several packages at once with pacman::p_load().
library("summarytools")
library("psych")
library("ggplot2")
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
Q2: Set working directory and read in data
## [1] "C:\\Users\\dcg5438\\Desktop\\BBH 505\\Gilliland_Lab1_597.Rmd"
# 3. Read in data using read.csv(). The .csv file is called data_R1.csv. We use the assignment operator <- to denote we want a variable name to be represented by what is contained in the .csv file.
# Read the CSV file into a data frame named lab1_dat.
# NOTE(review): this is an absolute path into one user's Desktop folder, so the
# call will only find the file on a machine with the same folder layout.
lab1_dat <- read.csv("C:\\Users\\dcg5438\\Desktop\\data_R1.csv")
Q3: Ask for variable names and data structure
# 1. Use colnames() with the variable name encased in the parentheses we defined above to get the variable names.
colnames(lab1_dat)
## [1] "age" "height" "weight"
# 2. Use the str() function with the variable name encased in the parentheses we defined above to get the data structure (i.e., number of variables and observations).
str(lab1_dat)
## 'data.frame': 10 obs. of 3 variables:
## $ age : int 8 8 9 7 8 7 8 8 9 8
## $ height: int 45 47 49 44 45 43 44 45 47 52
## $ weight: int 57 58 62 51 60 51 55 58 64 65
Q4: Print the first 6 rows and last 6 rows of the data, and all of
the data
# 1. The head(varname, n = 6) function is a good way to view the first six rows of the data.
head(lab1_dat, n = 6)
8 |
45 |
57 |
8 |
47 |
58 |
9 |
49 |
62 |
7 |
44 |
51 |
8 |
45 |
60 |
7 |
43 |
51 |
# 2. The tail() function following the same format is a good way to view the last six rows of the data.
tail(lab1_dat, n = 6)
5 |
8 |
45 |
60 |
6 |
7 |
43 |
51 |
7 |
8 |
44 |
55 |
8 |
8 |
45 |
58 |
9 |
9 |
47 |
64 |
10 |
8 |
52 |
65 |
# 3. To view the entire dataset, you simply enter the dataset name we defined above.
lab1_dat
8 |
45 |
57 |
8 |
47 |
58 |
9 |
49 |
62 |
7 |
44 |
51 |
8 |
45 |
60 |
7 |
43 |
51 |
8 |
44 |
55 |
8 |
45 |
58 |
9 |
47 |
64 |
8 |
52 |
65 |
Q5: Ask for frequencies for age and height
# 1. To ask for frequencies, use the freq() function with the dataset name. To ask for a specific variable, we use a $ sign. For example, dataframe$variablename.
# Load summarytools, which provides freq().
library("summarytools")
# Request one frequency table per variable. The tables printed below cover
# age, height, and weight, so each column gets its own freq() call.
freq(lab1_dat$age)
freq(lab1_dat$height)
freq(lab1_dat$weight)
## Frequencies
## lab1_dat$age
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 7 2 20.00 20.00 20.00 20.00
## 8 6 60.00 80.00 60.00 80.00
## 9 2 20.00 100.00 20.00 100.00
## <NA> 0 0.00 100.00
## Total 10 100.00 100.00 100.00 100.00
## Frequencies
## lab1_dat$height
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 43 1 10.00 10.00 10.00 10.00
## 44 2 20.00 30.00 20.00 30.00
## 45 3 30.00 60.00 30.00 60.00
## 47 2 20.00 80.00 20.00 80.00
## 49 1 10.00 90.00 10.00 90.00
## 52 1 10.00 100.00 10.00 100.00
## <NA> 0 0.00 100.00
## Total 10 100.00 100.00 100.00 100.00
## Frequencies
## lab1_dat$weight
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 51 2 20.00 20.00 20.00 20.00
## 55 1 10.00 30.00 10.00 30.00
## 57 1 10.00 40.00 10.00 40.00
## 58 2 20.00 60.00 20.00 60.00
## 60 1 10.00 70.00 10.00 70.00
## 62 1 10.00 80.00 10.00 80.00
## 64 1 10.00 90.00 10.00 90.00
## 65 1 10.00 100.00 10.00 100.00
## <NA> 0 0.00 100.00
## Total 10 100.00 100.00 100.00 100.00
Q6: Recode height into groups
# 1. To recode into groups, we use the ifelse() function. It is organized as ifelse(condition, value if true, value if false). For example, to split age into above 18 versus 18 and below, we would use ifelse(dataframe$age > 18, 1, 0), where 1 = above 18 and 0 = 18 or below.
# Code each height as 1 (below 47) or 2 (47 and above). ifelse() checks the
# condition element by element and returns the matching value.
lab1_dat[["height_grp"]] <- ifelse(lab1_dat[["height"]] < 47, 1, 2)
class(lab1_dat[["height_grp"]])
## [1] "numeric"
# Turn the numeric codes into a labelled factor: 1 -> "Short", 2 -> "Tall".
lab1_dat[["height_grp"]] <- factor(
  lab1_dat[["height_grp"]],
  levels = c(1, 2),
  labels = c("Short", "Tall")
)
# Cross-tabulate the new groups against the raw heights to check the recode.
ctable(lab1_dat$height_grp, lab1_dat$height)
## Cross-Tabulation, Row Proportions
## height_grp * height
## Data Frame: lab1_dat
##
## ------------ -------- ----------- ----------- ----------- ----------- ----------- ----------- -------------
## height 43 44 45 47 49 52 Total
## height_grp
## Short 1 (16.7%) 2 (33.3%) 3 (50.0%) 0 ( 0.0%) 0 ( 0.0%) 0 ( 0.0%) 6 (100.0%)
## Tall 0 ( 0.0%) 0 ( 0.0%) 0 ( 0.0%) 2 (50.0%) 1 (25.0%) 1 (25.0%) 4 (100.0%)
## Total 1 (10.0%) 2 (20.0%) 3 (30.0%) 2 (20.0%) 1 (10.0%) 1 (10.0%) 10 (100.0%)
## ------------ -------- ----------- ----------- ----------- ----------- ----------- ----------- -------------
Q7: Plots
# ggplot2 is the preferred package for making plots because its plots are highly customizable. For now, I give you the script to make these plots. They are organized as follows: ggplot(dataframe, aes(x = xvariable, y = yvariable [if applicable])) starts the plot, and each further detail is added with a separate + sign. geom_{bar, histogram, line} sets the type of plot you want to make (e.g., a bar plot would be geom_bar(fill = "color", color = "line color", width = some number, typically 0.5-2.5)). For histograms, width is instead binwidth because we are binning our data. labs() specifies the title and the names of the axes using strings like "X Axis". theme_bw() is an easy way to make the plots look neat and clean.
# 1. Bar plot
# Bar plot: number of observations in each height group.
library(ggplot2)
ggplot(data = lab1_dat, mapping = aes(x = height_grp)) +
  geom_bar(width = 0.5, color = "black", fill = "gray") +
  ggtitle("Bar Plot") +
  xlab("Height group") +
  ylab("Count") +
  theme_bw()

# 2. Histogram
# Histogram: counts of height, binned in steps of 2.5.
ggplot(data = lab1_dat, mapping = aes(x = height)) +
  geom_histogram(binwidth = 2.5, color = "black", fill = "gray") +
  ggtitle("Histogram") +
  xlab("Height (inch)") +
  ylab("Count") +
  theme_bw()

# 3. Histogram with a density curve
# Histogram of height on the density scale with a normal curve overlaid.
ggplot(lab1_dat, aes(x = height)) +
  # after_stat(density) plots densities instead of counts so the bars share a
  # y-axis with the dnorm() curve.
  geom_histogram(
    aes(y = after_stat(density)),
    fill = "gray", color = "black", binwidth = 2.5
  ) +
  # Normal density using the sample mean and SD of height. TRUE is spelled out
  # because T is an ordinary variable that can be reassigned.
  stat_function(
    fun = dnorm,
    args = list(
      mean = mean(lab1_dat$height, na.rm = TRUE),
      sd = sd(lab1_dat$height, na.rm = TRUE)
    ),
    color = "blue", linewidth = 1
  ) +
  labs(title = "Histogram with Density Curve", x = "Height (inch)", y = "Density") +
  scale_x_continuous(limits = c(40, 55)) +
  theme_bw()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# 4. Line plot
# Line plot: number of observations at each height value.
ggplot(lab1_dat, aes(x = height)) +
  # stat = "count" makes the line follow the count of rows per height value.
  geom_line(stat = "count") +
  # Height is labelled in inches, as in the histograms of the same variable
  # (the label previously read "lb").
  labs(title = "Line Plot", x = "Height (inch)", y = "Count") +
  scale_x_continuous(limits = c(40, 55)) +
  theme_bw()
