Loading the packages whose functions are used later in the code (the one-time install step is left commented out).
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Practicing basic concepts for data management to learn commands.
#defining variable a with the value 7
a <- 7
#defining variable b with the value 3
b <- 3
#defining variable c as twice a plus b (2a+b)
c <- 2 * a + b
#displaying the values of a, b, and c
print(a)
## [1] 7
print(b)
## [1] 3
print(c)
## [1] 17
Learning how to overwrite existing variables with newly assigned values.
#overwriting variable a with the new value 24
a <- 24
#overwriting variable b with the new value 32
b <- 32
#overwriting variable c with (48/a) - 3*b, computed from the new a and b
c <- 48 / a - 3 * b
#displaying the updated values of a, b, and c
print(a)
## [1] 24
print(b)
## [1] 32
print(c)
## [1] -94
Learning how to create lists and subsets, and how to check data types.
#building a list that holds the letters a through e
#(note: this name shadows R's built-in letters constant)
letters <- list("a", "b", "c", "d", "e")
#inspecting the structure: a list of 5 character elements
str(letters)
## List of 5
## $ : chr "a"
## $ : chr "b"
## $ : chr "c"
## $ : chr "d"
## $ : chr "e"
#keeping every element except "b" with a logical index, stored as skip_b
skip_b <- letters[letters != "b"]
#displaying skip_b: the same list as letters with "b" removed
print(skip_b)
## [[1]]
## [1] "a"
##
## [[2]]
## [1] "c"
##
## [[3]]
## [1] "d"
##
## [[4]]
## [1] "e"
More practice with lists, and learning how to convert data types.
#building a list called numbers that holds the values 1 through 10
#(as.numeric keeps each element numeric rather than integer)
numbers <- as.list(as.numeric(1:10))
#inspecting the structure: every element of numbers is numeric
str(numbers)
## List of 10
## $ : num 1
## $ : num 2
## $ : num 3
## $ : num 4
## $ : num 5
## $ : num 6
## $ : num 7
## $ : num 8
## $ : num 9
## $ : num 10
#storing a character version of numbers as numbers_2; numbers is unchanged
numbers_2 <- as.character(numbers)
#confirming that numbers_2 holds character data
is.character(numbers_2)
## [1] TRUE
#keeping only the elements of numbers greater than 4, stored as five_plus
five_plus <- numbers[numbers > 4]
#displaying the contents of five_plus
print(five_plus)
## [[1]]
## [1] 5
##
## [[2]]
## [1] 6
##
## [[3]]
## [1] 7
##
## [[4]]
## [1] 8
##
## [[5]]
## [1] 9
##
## [[6]]
## [1] 10
#five_plus was built from numbers rather than numbers_2:
#the numeric comparison > 4 works directly on numbers,
#while a character comparison would require more code
Setting up file path and reading data into a new dataframe.
#folder that holds the HERS data file
#NOTE(review): this absolute path is specific to one computer; update it
#(or switch to a project-relative path) before running the code elsewhere
data_dir <- "C:/Users/mdrd1/Desktop/PH141/Assignment #1"
#reading the HERS data into the dataframe hers from its full file path,
#which leaves the session's working directory unchanged (no setwd needed)
hers <- read.csv(file.path(data_dir, "PH 140 W1 HERS Data.csv"))
Exploring variables, observations, and data types within dataset.
#Show the first 6 rows of hers
head(hers, n = 6)
## unique age race smoker diabetes bmi sbp whr
## 1 64 56 African American Not current smoker Diabetic 34.1 120 0.932
## 2 73 77 White Not current smoker Non-diabetic 19.6 129 0.782
## 3 90 63 White Not current smoker Non-diabetic 24.0 130 0.759
## 4 130 61 White Not current smoker Non-diabetic 24.1 118 0.845
## 5 135 63 White Not current smoker Diabetic 30.7 171 0.926
## 6 163 68 White Current smoker Non-diabetic 20.0 144 0.772
#Show the last 6 rows of hers
tail(hers, n = 6)
## unique age race smoker diabetes bmi sbp whr
## 115 2647 74 White Not current smoker Non-diabetic 28.9 122 0.879
## 116 2679 75 White Not current smoker Non-diabetic 28.9 154 0.841
## 117 2683 53 White Not current smoker Diabetic 35.1 132 1.026
## 118 2690 66 White Not current smoker Non-diabetic 32.5 129 0.845
## 119 2748 78 White Not current smoker Non-diabetic 28.9 119 0.791
## 120 2754 70 White Not current smoker Diabetic 28.2 138 0.854
#Show the structure of hers: number of rows, variables, and each data type
str(hers)
## 'data.frame': 120 obs. of 8 variables:
## $ unique : int 64 73 90 130 135 163 204 237 238 273 ...
## $ age : int 56 77 63 61 63 68 78 72 74 69 ...
## $ race : chr "African American" "White" "White" "White" ...
## $ smoker : chr "Not current smoker" "Not current smoker" "Not current smoker" "Not current smoker" ...
## $ diabetes: chr "Diabetic" "Non-diabetic" "Non-diabetic" "Non-diabetic" ...
## $ bmi : num 34.1 19.6 24 24.1 30.7 ...
## $ sbp : int 120 129 130 118 171 144 135 110 164 142 ...
## $ whr : num 0.932 0.782 0.759 0.845 0.926 ...
#Show the structure of the age variable alone
str(hers[["age"]])
## int [1:120] 56 77 63 61 63 68 78 72 74 69 ...
Learning to create new variables from existing HERS dataset.
#adding ageover65 to hers: 1 when age > 65, 0 when age <= 65
hers$ageover65 <- with(hers, ifelse(age > 65, 1, 0))
#tabulating ageover65 to check the two categories
table(hers$ageover65)
##
## 0 1
## 47 73
#adding highbp to hers: "Yes" when sbp >= 140, "No" when sbp < 140
hers$highbp <- with(hers, ifelse(sbp >= 140, "Yes", "No"))
#tabulating highbp to check the two categories
table(hers$highbp)
##
## No Yes
## 82 38
Analyzing certain observations within the large HERS dataset.
#new dataframe over65: keeps only the rows of hers where ageover65 is 1,
#dropping everyone aged 65 and under
#the condition names the column directly, so subset() evaluates it inside
#the piped dataframe instead of reaching back to the global hers object
over65 <- hers %>%
  subset(ageover65 == 1)
#new dataframe bphigher140: keeps only the rows of hers where highbp is "Yes",
#dropping everyone with sbp under 140
bphigher140 <- hers %>%
  subset(highbp == "Yes")
Specifying variables to keep in newly created dataframes.
#keeping only the unique, age, diabetes, and bmi columns of over65
over65_2 <- dplyr::select(over65, unique, age, diabetes, bmi)
#checking that the columns are correct in the first 6 rows
head(over65_2)
## unique age diabetes bmi
## 2 73 77 Non-diabetic 19.6
## 6 163 68 Non-diabetic 20.0
## 7 204 78 Non-diabetic 25.7
## 8 237 72 Diabetic 23.5
## 9 238 74 Diabetic 25.7
## 10 273 69 Non-diabetic 27.9
#checking that the columns are correct in the last 6 rows
tail(over65_2)
## unique age diabetes bmi
## 114 2638 76 Non-diabetic 25.0
## 115 2647 74 Non-diabetic 28.9
## 116 2679 75 Non-diabetic 28.9
## 118 2690 66 Non-diabetic 32.5
## 119 2748 78 Non-diabetic 28.9
## 120 2754 70 Diabetic 28.2
#keeping only the unique, age, smoker, and sbp columns of bphigher140
bphigher140_2 <- dplyr::select(bphigher140, unique, age, smoker, sbp)
#checking that the columns are correct in the first 6 rows
head(bphigher140_2)
## unique age smoker sbp
## 5 135 63 Not current smoker 171
## 6 163 68 Current smoker 144
## 9 238 74 Not current smoker 164
## 10 273 69 Not current smoker 142
## 15 409 70 Not current smoker 159
## 16 414 65 Not current smoker 172
#checking that the columns are correct in the last 6 rows
tail(bphigher140_2)
## unique age smoker sbp
## 104 2290 75 Not current smoker 160
## 106 2419 73 Not current smoker 144
## 107 2429 75 Not current smoker 185
## 108 2444 75 Not current smoker 151
## 112 2564 69 Not current smoker 153
## 116 2679 75 Not current smoker 154