# Name: Rachana Raghavendra
# Student ID:921053725

# Learning Module Quiz 3
# PSC 103A 
# University of California, Davis

# Don't forget to comment out any install.packages()
# commands by prefixing them with #,
# so that the report is generated correctly
# without issues.


## Q1:

# Point R at the course folder that holds the data file. The change of
# directory is skipped when the folder does not exist (e.g. on another
# machine); 'lab2data.txt' is then read from the current working directory.
course_dir <- "C:/Users/racha/PSC103A"
if (dir.exists(course_dir)) {
  setwd(course_dir)
}
getwd()
## [1] "C:/Users/racha/PSC103A"

# Read the data file. The first row holds the column names, and the text
# columns (biosex, ed_cmplt) are stored as factors.
bmiData <- read.table("lab2data.txt", header = TRUE, stringsAsFactors = TRUE)
head(bmiData, 10) # show the first 10 rows
##    biosex height_in weight_lbs age_yr   ed_cmplt
## 1    MALE      72.2      146.4   20.0 College2YR
## 2  FEMALE      61.5      103.8   22.7 College4YR
## 3    MALE      68.4      162.7   25.7         MA
## 4    MALE      69.6      151.8   22.9 College4YR
## 5  FEMALE      62.3      117.7   23.4 College4YR
## 6    MALE      67.5      132.1   20.2 College2YR
## 7    MALE      69.9      149.5   20.1 College2YR
## 8  FEMALE      58.9      113.4   23.1 College4YR
## 9  FEMALE      57.9      124.8   30.6        PhD
## 10 FEMALE        NA      111.8   28.2        PhD
str(bmiData) # column types and a preview of each column
## 'data.frame':    232 obs. of  5 variables:
##  $ biosex    : Factor w/ 2 levels "FEMALE","MALE": 2 1 2 2 1 2 2 1 1 1 ...
##  $ height_in : num  72.2 61.5 68.4 69.6 62.3 67.5 69.9 58.9 57.9 NA ...
##  $ weight_lbs: num  146 104 163 152 118 ...
##  $ age_yr    : num  20 22.7 25.7 22.9 23.4 20.2 20.1 23.1 30.6 28.2 ...
##  $ ed_cmplt  : Factor w/ 5 levels "College2YR","College4YR",..: 1 2 4 2 2 1 1 2 5 5 ...
class(bmiData$biosex) # confirm that biosex was read in as a factor
## [1] "factor"
## Q2:

##install.packages("Hmisc")

library(Hmisc) # The library function tells R to add the 'Hmisc' package to the library so we can access its function
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Frequency table for biosex: the number of observations in each category.
biosex_freq <- table(bmiData$biosex)
biosex_freq # prints the biosex frequency table
## 
## FEMALE   MALE 
##    119    113

# Frequency table for ed_cmplt: the number of observations in each category.
educ_freq <- table(bmiData$ed_cmplt)
educ_freq # prints the education frequency table
## 
## College2YR College4YR         HS         MA        PhD 
##         51         50         39         55         37

# Relative frequencies: each category count divided by the total number of
# observations in the biosex column.
biosex_relfreq <- biosex_freq / length(bmiData$biosex)
biosex_relfreq
## 
##   FEMALE     MALE 
## 0.512931 0.487069
## Q3/4:

# Per-variable summary of the whole data frame (n, missing, distinct values,
# and for the numeric columns the mean and quantiles), using describe() from
# the Hmisc package loaded above.
describe(bmiData)
## bmiData 
## 
##  5  Variables      232  Observations
## --------------------------------------------------------------------------------
## biosex 
##        n  missing distinct 
##      232        0        2 
##                         
## Value      FEMALE   MALE
## Frequency     119    113
## Proportion  0.513  0.487
## --------------------------------------------------------------------------------
## height_in 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      228        4      120        1    63.78    5.309    57.73    58.17 
##      .25      .50      .75      .90      .95 
##    59.80    63.50    67.82    69.93    70.80 
## 
## lowest : 54   55.9 56.4 56.5 56.9, highest: 72.5 72.7 72.8 73.2 73.4
## --------------------------------------------------------------------------------
## weight_lbs 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      230        2      192        1    131.6    23.07    102.2    105.8 
##      .25      .50      .75      .90      .95 
##    114.7    129.6    149.2    157.9    161.2 
## 
## lowest : 88    94.8  95.2  95.6  95.8 , highest: 166.3 169.9 170.5 171.4 171.8
## --------------------------------------------------------------------------------
## age_yr 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      229        3       79        1    24.23    3.566    19.80    20.00 
##      .25      .50      .75      .90      .95 
##    22.20    24.30    25.50    29.82    30.20 
## 
## lowest : 19.4 19.5 19.6 19.7 19.8, highest: 30.9 31.5 31.6 31.9 32.2
## --------------------------------------------------------------------------------
## ed_cmplt 
##        n  missing distinct 
##      232        0        5 
##                                                                  
## Value      College2YR College4YR         HS         MA        PhD
## Frequency          51         50         39         55         37
## Proportion      0.220      0.216      0.168      0.237      0.159
## --------------------------------------------------------------------------------
## Q5/6:
# Sample mean of height_in, ignoring missing values. It is stored in
# 'mean_height' because the boxplot below overlays it as a point.
mean_height <- mean(bmiData$height_in, na.rm = TRUE)
mean_height # mean
## [1] 63.77763
median(bmiData$height_in, na.rm = TRUE) # median
## [1] 63.5

# Boxplot of height_in in base R, with the mean added on top of it.
boxplot(bmiData$height_in)
points(
  x = 1,           # horizontal position of the single box
  y = mean_height, # the mean computed above
  col = "blue",    # colour of the symbol that marks the mean
  pch = 8          # shape of the symbol that marks the mean
)

## Q7:

# Subset of height_in values for the rows where biosex is "MALE".
height_MALE <- bmiData[bmiData$biosex == "MALE", "height_in"]

# Mean height for biosex = MALE, ignoring missing values.
mean_height_MALE <- mean(height_MALE, na.rm = TRUE)
mean_height_MALE # print the mean height for biosex = MALE
## [1] 68.0027
length(height_MALE) # number of MALE rows (missing heights are counted too)
## [1] 113

# Boxplot of the MALE heights, with their mean added on top of it.
boxplot(height_MALE)
points(
  x = 1,                # horizontal position of the single box
  y = mean_height_MALE, # the mean computed above
  col = "red",          # colour of the symbol that marks the mean
  pch = 16              # shape of the symbol that marks the mean
)

# Trimmed mean: drops 10% of the observations from each end before averaging.
mean(height_MALE, trim = 0.10, na.rm = TRUE)
## [1] 67.99775

# Spread of age_yr across the whole sample, ignoring missing values.
range(bmiData$age_yr, na.rm = TRUE) # minimum and maximum
## [1] 19.4 32.2
var(bmiData$age_yr, na.rm = TRUE) # variance
## [1] 10.20568
sd(bmiData$age_yr, na.rm = TRUE) # standard deviation
## [1] 3.194632
IQR(bmiData$age_yr, na.rm = TRUE) # interquartile range
## [1] 3.3

# Rows for males older than 25. which() keeps only the rows where the
# condition is TRUE, so rows with a missing age are left out rather than
# being returned as all-NA rows.
MALE_older <- bmiData[which(bmiData$biosex == "MALE" & bmiData$age_yr > 25), ]
MALE_older
##     biosex height_in weight_lbs age_yr ed_cmplt
## 3     MALE      68.4      162.7   25.7       MA
## 20    MALE      65.2      169.9   25.3       HS
## 25    MALE      68.7      171.4   25.4       MA
## 33    MALE      68.7      146.3   30.2      PhD
## 44    MALE      69.3      145.7   25.1       MA
## 45    MALE      66.7      143.6   31.9      PhD
## 63    MALE      69.5      144.9   25.4       HS
## 66    MALE      68.6      141.2   25.9       MA
## 67    MALE      70.4      158.3   27.4      PhD
## 76    MALE      64.7      145.6   29.7      PhD
## 81    MALE      65.2      132.9   25.8       MA
## 87    MALE      70.8      159.5   28.3      PhD
## 92    MALE      67.8      165.7   25.4       HS
## 103   MALE      66.6      160.0   25.7       HS
## 106   MALE      63.7      151.5   25.3       HS
## 107   MALE        NA      145.5   26.8      PhD
## 111   MALE      69.9      123.1   25.3       MA
## 113   MALE      70.8      159.4   29.9      PhD
## 114   MALE      68.6      150.9   25.8       HS
## 120   MALE      66.6      149.3   30.2      PhD
## 122   MALE      63.7      151.2   25.2       MA
## 123   MALE      62.8      156.4   29.3      PhD
## 125   MALE      70.9      160.5   25.6       HS
## 132   MALE      67.2      154.3   25.7       HS
## 139   MALE      66.9      153.6   25.2       MA
## 145   MALE      67.4      118.0   28.7      PhD
## 146   MALE      68.4      141.6   30.9      PhD
## 149   MALE      65.5      145.6   27.3       HS
## 154   MALE      68.7      148.0   29.9      PhD
## 159   MALE      68.3      146.7   30.0      PhD
## 163   MALE      68.3      161.8   29.9      PhD
## 171   MALE      68.0      157.9   28.5      PhD
## 178   MALE      68.2      157.9   25.4       HS
## 180   MALE      67.8      147.0   31.5      PhD
## 183   MALE      69.8      148.1   30.3      PhD
## 193   MALE      68.3      151.9   30.0      PhD
## 197   MALE      71.7      157.2   25.4       HS
## 198   MALE      65.7      150.8   25.1       HS
## 205   MALE      67.5      153.5   25.7       MA
## 206   MALE      67.7      137.5   26.4       HS
## 210   MALE      67.7      149.1   30.3      PhD
## 218   MALE      66.7      170.5   25.5       HS
## 227   MALE      67.7      140.4   25.8       MA
## 228   MALE      66.1      156.9   25.1       MA
## 229   MALE        NA      159.1   25.4       MA
## 230   MALE      69.4      162.0   30.0      PhD
summary(MALE_older) # column-by-column summary of the older-male subset
##     biosex     height_in       weight_lbs        age_yr            ed_cmplt 
##  FEMALE: 0   Min.   :62.80   Min.   :118.0   Min.   :25.10   College2YR: 0  
##  MALE  :46   1st Qu.:66.67   1st Qu.:145.6   1st Qu.:25.40   College4YR: 0  
##              Median :67.90   Median :151.3   Median :25.85   HS        :14  
##              Mean   :67.74   Mean   :151.4   Mean   :27.36   MA        :12  
##              3rd Qu.:68.70   3rd Qu.:158.9   3rd Qu.:29.90   PhD       :20  
##              Max.   :71.70   Max.   :171.4   Max.   :31.90                  
##              NA's   :2
## Q7,8/9

# The weight column is referred to by its full name, weight_lbs. The shorter
# 'weight_lb' only worked because `$` partially matches data-frame column
# names, which breaks silently if a second matching column is ever added.

# Central tendency of weight for males older than 25.
mean_weight_MALE_older <- mean(MALE_older$weight_lbs, na.rm = TRUE)
mean_weight_MALE_older # mean
## [1] 151.4109
median(MALE_older$weight_lbs, na.rm = TRUE) # median
## [1] 151.35
mean(MALE_older$weight_lbs, trim = 0.10, na.rm = TRUE) # 10% trimmed mean
## [1] 151.9974

# Spread of weight for males older than 25.
range(MALE_older$weight_lbs, na.rm = TRUE) # minimum and maximum values
## [1] 118.0 171.4
var(MALE_older$weight_lbs, na.rm = TRUE) # variance
## [1] 119.0059
sd(MALE_older$weight_lbs, na.rm = TRUE) # standard deviation
## [1] 10.90898
iqr_x <- IQR(MALE_older$weight_lbs, na.rm = TRUE) # interquartile range
iqr_x # print the interquartile range

# Boxplot of the weights, with their mean added on top of it.
boxplot(MALE_older$weight_lbs)
points(
  x = 1,                      # horizontal position of the single box
  y = mean_weight_MALE_older, # the mean computed above
  col = "red",                # colour of the symbol that marks the mean
  pch = 16                    # shape of the symbol that marks the mean
)

# Histogram of the same weights.
hist(
  MALE_older$weight_lbs,
  main = "Males older than 25 years",
  col = 4,
  xlab = "Weight (lb)"
)

# Q11 - Upload this R script (.R) to Canvas
# with your code.