# Name: Rachana Raghavendra
# Student ID:921053725
# Learning Module Quiz 3
# PSC 103A
# University of California, Davis
# Don't forget to comment out any install.packages()
# commands by using # to comment it out,
# so that the report is generated correctly
# without issues.
## Q1:
# Point R at the course folder so the relative data path below resolves.
# NOTE: this path is specific to the author's machine; adjust it before
# running the script anywhere else.
setwd("C:/Users/racha/PSC103A")
getwd() # confirm the working directory
## [1] "C:/Users/racha/PSC103A"
# Read the data file: the first row holds the column names and text columns
# are converted to factors. TRUE is spelled out because the shorthand T is an
# ordinary variable that can be reassigned.
bmiData <- read.table("lab2data.txt", header = TRUE, stringsAsFactors = TRUE)
head(bmiData, 10) # first 10 rows
## biosex height_in weight_lbs age_yr ed_cmplt
## 1 MALE 72.2 146.4 20.0 College2YR
## 2 FEMALE 61.5 103.8 22.7 College4YR
## 3 MALE 68.4 162.7 25.7 MA
## 4 MALE 69.6 151.8 22.9 College4YR
## 5 FEMALE 62.3 117.7 23.4 College4YR
## 6 MALE 67.5 132.1 20.2 College2YR
## 7 MALE 69.9 149.5 20.1 College2YR
## 8 FEMALE 58.9 113.4 23.1 College4YR
## 9 FEMALE 57.9 124.8 30.6 PhD
## 10 FEMALE NA 111.8 28.2 PhD
str(bmiData) # structure: dimensions and the type of each column
## 'data.frame': 232 obs. of 5 variables:
## $ biosex : Factor w/ 2 levels "FEMALE","MALE": 2 1 2 2 1 2 2 1 1 1 ...
## $ height_in : num 72.2 61.5 68.4 69.6 62.3 67.5 69.9 58.9 57.9 NA ...
## $ weight_lbs: num 146 104 163 152 118 ...
## $ age_yr : num 20 22.7 25.7 22.9 23.4 20.2 20.1 23.1 30.6 28.2 ...
## $ ed_cmplt : Factor w/ 5 levels "College2YR","College4YR",..: 1 2 4 2 2 1 1 2 5 5 ...
class(bmiData$biosex) # biosex is stored as a factor
## [1] "factor"
## Q2:
##install.packages("Hmisc")
library(Hmisc) # The library function tells R to add the 'Hmisc' package to the library so we can access its function
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Frequency tables for the two categorical variables.
biosex_freq <- table(bmiData$biosex) # counts per biosex category
biosex_freq # display the biosex counts
##
## FEMALE MALE
## 119 113
educ_freq <- table(bmiData$ed_cmplt) # counts per ed_cmplt category
educ_freq # display the education counts
##
## College2YR College4YR HS MA PhD
## 51 50 39 55 37
# Relative frequencies: each count divided by the total number of observations
# (rows) in the data set.
biosex_relfreq <- biosex_freq / nrow(bmiData)
biosex_relfreq
##
## FEMALE MALE
## 0.512931 0.487069
## Q3/4:
# Per-variable descriptive summary from the Hmisc package.
describe(bmiData)
## bmiData
##
## 5 Variables 232 Observations
## --------------------------------------------------------------------------------
## biosex
## n missing distinct
## 232 0 2
##
## Value FEMALE MALE
## Frequency 119 113
## Proportion 0.513 0.487
## --------------------------------------------------------------------------------
## height_in
## n missing distinct Info Mean Gmd .05 .10
## 228 4 120 1 63.78 5.309 57.73 58.17
## .25 .50 .75 .90 .95
## 59.80 63.50 67.82 69.93 70.80
##
## lowest : 54 55.9 56.4 56.5 56.9, highest: 72.5 72.7 72.8 73.2 73.4
## --------------------------------------------------------------------------------
## weight_lbs
## n missing distinct Info Mean Gmd .05 .10
## 230 2 192 1 131.6 23.07 102.2 105.8
## .25 .50 .75 .90 .95
## 114.7 129.6 149.2 157.9 161.2
##
## lowest : 88 94.8 95.2 95.6 95.8 , highest: 166.3 169.9 170.5 171.4 171.8
## --------------------------------------------------------------------------------
## age_yr
## n missing distinct Info Mean Gmd .05 .10
## 229 3 79 1 24.23 3.566 19.80 20.00
## .25 .50 .75 .90 .95
## 22.20 24.30 25.50 29.82 30.20
##
## lowest : 19.4 19.5 19.6 19.7 19.8, highest: 30.9 31.5 31.6 31.9 32.2
## --------------------------------------------------------------------------------
## ed_cmplt
## n missing distinct
## 232 0 5
##
## Value College2YR College4YR HS MA PhD
## Frequency 51 50 39 55 37
## Proportion 0.220 0.216 0.168 0.237 0.159
## --------------------------------------------------------------------------------
## Q5/6:
# Centre of the height distribution; missing heights are dropped.
# The mean is kept in 'mean_height' because the boxplot overlay below reuses it.
mean_height <- mean(bmiData$height_in, na.rm = TRUE)
print(mean_height) # mean
## [1] 63.77763
median(bmiData$height_in, na.rm = TRUE) # median
## [1] 63.5
# Base R boxplot of height, with the mean overlaid at x = 1 as a blue
# symbol (pch = 8 selects the symbol shape).
boxplot(bmiData$height_in)
points(1, mean_height, col = "blue", pch = 8)

## Q7:
# Heights for biosex = MALE only. which() keeps just the rows where the
# comparison is TRUE; a bare logical index would return an NA height for every
# row whose biosex is missing and so inflate length(height_MALE).
height_MALE <- bmiData[which(bmiData$biosex == "MALE"), "height_in"]
# Mean height for biosex = MALE, ignoring missing heights; stored because the
# boxplot overlay below reuses it.
mean_height_MALE <- mean(height_MALE, na.rm = TRUE)
mean_height_MALE # print the mean height for biosex = MALE
## [1] 68.0027
length(height_MALE) # number of MALE rows (missing heights included)
## [1] 113
# Boxplot of the male heights with the mean overlaid at x = 1 as a red
# symbol (pch = 16 selects the symbol shape).
boxplot(height_MALE)
points(x = 1, y = mean_height_MALE, col = "red", pch = 16)

# Trimmed mean of male height (trim = 0.10). TRUE is spelled out because the
# shorthand T is an ordinary variable that can be reassigned.
mean(height_MALE, trim = 0.10, na.rm = TRUE)
## [1] 67.99775
# Spread of age across the whole sample, ignoring missing ages.
range(bmiData$age_yr, na.rm = TRUE) # minimum and maximum
## [1] 19.4 32.2
var(bmiData$age_yr, na.rm = TRUE) # variance
## [1] 10.20568
sd(bmiData$age_yr, na.rm = TRUE) # standard deviation
## [1] 3.194632
IQR(bmiData$age_yr, na.rm = TRUE) # interquartile range
## [1] 3.3
# Rows for males older than 25. which() drops rows where the condition is NA
# (for example, a missing age) instead of returning all-NA rows.
MALE_older <- bmiData[
  with(bmiData, which(biosex == "MALE" & age_yr > 25)),
]
print(MALE_older)
## biosex height_in weight_lbs age_yr ed_cmplt
## 3 MALE 68.4 162.7 25.7 MA
## 20 MALE 65.2 169.9 25.3 HS
## 25 MALE 68.7 171.4 25.4 MA
## 33 MALE 68.7 146.3 30.2 PhD
## 44 MALE 69.3 145.7 25.1 MA
## 45 MALE 66.7 143.6 31.9 PhD
## 63 MALE 69.5 144.9 25.4 HS
## 66 MALE 68.6 141.2 25.9 MA
## 67 MALE 70.4 158.3 27.4 PhD
## 76 MALE 64.7 145.6 29.7 PhD
## 81 MALE 65.2 132.9 25.8 MA
## 87 MALE 70.8 159.5 28.3 PhD
## 92 MALE 67.8 165.7 25.4 HS
## 103 MALE 66.6 160.0 25.7 HS
## 106 MALE 63.7 151.5 25.3 HS
## 107 MALE NA 145.5 26.8 PhD
## 111 MALE 69.9 123.1 25.3 MA
## 113 MALE 70.8 159.4 29.9 PhD
## 114 MALE 68.6 150.9 25.8 HS
## 120 MALE 66.6 149.3 30.2 PhD
## 122 MALE 63.7 151.2 25.2 MA
## 123 MALE 62.8 156.4 29.3 PhD
## 125 MALE 70.9 160.5 25.6 HS
## 132 MALE 67.2 154.3 25.7 HS
## 139 MALE 66.9 153.6 25.2 MA
## 145 MALE 67.4 118.0 28.7 PhD
## 146 MALE 68.4 141.6 30.9 PhD
## 149 MALE 65.5 145.6 27.3 HS
## 154 MALE 68.7 148.0 29.9 PhD
## 159 MALE 68.3 146.7 30.0 PhD
## 163 MALE 68.3 161.8 29.9 PhD
## 171 MALE 68.0 157.9 28.5 PhD
## 178 MALE 68.2 157.9 25.4 HS
## 180 MALE 67.8 147.0 31.5 PhD
## 183 MALE 69.8 148.1 30.3 PhD
## 193 MALE 68.3 151.9 30.0 PhD
## 197 MALE 71.7 157.2 25.4 HS
## 198 MALE 65.7 150.8 25.1 HS
## 205 MALE 67.5 153.5 25.7 MA
## 206 MALE 67.7 137.5 26.4 HS
## 210 MALE 67.7 149.1 30.3 PhD
## 218 MALE 66.7 170.5 25.5 HS
## 227 MALE 67.7 140.4 25.8 MA
## 228 MALE 66.1 156.9 25.1 MA
## 229 MALE NA 159.1 25.4 MA
## 230 MALE 69.4 162.0 30.0 PhD
# Column-by-column summary of the older-male subset.
print(summary(MALE_older))
## biosex height_in weight_lbs age_yr ed_cmplt
## FEMALE: 0 Min. :62.80 Min. :118.0 Min. :25.10 College2YR: 0
## MALE :46 1st Qu.:66.67 1st Qu.:145.6 1st Qu.:25.40 College4YR: 0
## Median :67.90 Median :151.3 Median :25.85 HS :14
## Mean :67.74 Mean :151.4 Mean :27.36 MA :12
## 3rd Qu.:68.70 3rd Qu.:158.9 3rd Qu.:29.90 PhD :20
## Max. :71.70 Max. :171.4 Max. :31.90
## NA's :2
## Q7,8/9
# Weight statistics for males older than 25, ignoring missing weights.
# The column is referenced by its full name, weight_lbs: the abbreviated
# weight_lb only resolved through `$` partial matching, which breaks silently
# if another column starting with "weight_lb" is ever added.
mean_weight_MALE_older <- mean(MALE_older$weight_lbs, na.rm = TRUE)
mean_weight_MALE_older
## [1] 151.4109
median(MALE_older$weight_lbs, na.rm = TRUE) # median
## [1] 151.35
mean(MALE_older$weight_lbs, trim = 0.10, na.rm = TRUE) # trimmed mean
## [1] 151.9974
range(MALE_older$weight_lbs, na.rm = TRUE) # range (gives the minimum and maximum values)
## [1] 118.0 171.4
var(MALE_older$weight_lbs, na.rm = TRUE) # variance
## [1] 119.0059
sd(MALE_older$weight_lbs, na.rm = TRUE) # standard deviation
## [1] 10.90898
iqr_x <- IQR(MALE_older$weight_lbs, na.rm = TRUE) # interquartile range
iqr_x # print the IQR so it is reported like the other statistics
# Boxplot of weight with the mean overlaid at x = 1 as a red symbol
# (pch = 16 selects the symbol shape).
boxplot(MALE_older$weight_lbs)
points(x = 1, y = mean_weight_MALE_older, col = "red", pch = 16)

# Histogram of weight for the same subset; col = 4 sets the bar fill colour.
hist(MALE_older$weight_lbs,
     main = "Males older than 25 years",
     col = 4,
     xlab = "Weight (lb)")

# Q11 - Upload this R script (.R) to Canvas
# with your code.