Lecture 2

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

## Exercise 1

################### EXERCISE ########################################

# Load the data set: the sourced script is expected to define the `cdc`
# data frame that every later statement in this document reads.
source("http://www.openintro.org/stat/data/cdc.R")
## Exercise: Let us see some other numerical calculations.
output <- "[1]Create a numerical summary for height and compute the interquartile range"
cat("\n",output,"\n")

## 
##  [1]Create a numerical summary for height and compute the interquartile range

summary(cdc$height)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   48.00   64.00   67.00   67.18   70.00   93.00

# The prompt also asks for the interquartile range (3rd Qu. - 1st Qu.).
IQR(cdc$height)

## [1] 6

output <- "[2]Create a numerical summary for age and compute the interquartile range"
cat("\n",output,"\n")

## 
##  [2]Create a numerical summary for age and compute the interquartile range

summary(cdc$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   31.00   43.00   45.07   57.00   99.00

# Interquartile range of age (3rd Qu. - 1st Qu.).
IQR(cdc$age)

## [1] 26

# Relative frequency distributions for gender and exerany: each category
# count is divided by the total number of rows in cdc.
numRows <- nrow(cdc)
output <- "[3]Relative frequency distribution for gender:"
cat("\n", output, "\n")

## 
##  [3]Relative frequency distribution for gender:

gender_counts <- table(cdc$gender)
gender_counts / numRows

## 
##       m       f 
## 0.47845 0.52155

output <- "[4]Relative frequency distribution for exerany:"
cat("\n", output, "\n")

## 
##  [4]Relative frequency distribution for exerany:

exerany_counts <- table(cdc$exerany)
exerany_counts / numRows

## 
##      0      1 
## 0.2543 0.7457

# Question 5: raw counts per gender (the "m" column answers the question).
output <- "[5]How many males are in the sample?"
cat("\n", output, "\n")

## 
##  [5]How many males are in the sample?

table(cdc[["gender"]])

## 
##     m     f 
##  9569 10431

# Question 6: share of respondents in each general-health category
# (the "excellent" column answers the question).
output <- "[6]What proportion of the sample reports being in excellent health?"
cat("\n", output, "\n")

## 
##  [6]What proportion of the sample reports being in excellent health?

genhlth_counts <- table(cdc[["genhlth"]])
genhlth_counts / nrow(cdc)

## 
## excellent very good      good      fair      poor 
##   0.23285   0.34860   0.28375   0.10095   0.03385

output <- "[7]HOW DO WE Create a new object called under23_and_smoke that contains all observations of respondents under the age of 23 that have smoked 100 cigarettes in their lifetime. Write the command you used to create the new object as the answer to this exercise.
"
cat("\n", output, "\n")

## 
##  [7]HOW DO WE Create a new object called under23_and_smoke that contains all observations of respondents under the age of 23 that have smoked 100 cigarettes in their lifetime. Write the command you used to create the new object as the answer to this exercise.
##

# Keep the rows where age is below 23 and smoke100 equals 1.
# which() drops rows whose condition evaluates to NA, as subset() does.
is_young_smoker <- cdc$age < 23 & cdc$smoke100 == 1
under23_and_smoke <- cdc[which(is_young_smoker), ]
summary(under23_and_smoke)

##       genhlth       exerany          hlthplan         smoke100     height     
##  excellent:110   Min.   :0.0000   Min.   :0.0000   Min.   :1   Min.   :59.00  
##  very good:244   1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:1   1st Qu.:65.00  
##  good     :204   Median :1.0000   Median :1.0000   Median :1   Median :68.00  
##  fair     : 53   Mean   :0.8145   Mean   :0.6952   Mean   :1   Mean   :67.92  
##  poor     :  9   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1   3rd Qu.:71.00  
##                  Max.   :1.0000   Max.   :1.0000   Max.   :1   Max.   :79.00  
##      weight         wtdesire          age        gender 
##  Min.   : 85.0   Min.   : 80.0   Min.   :18.00   m:305  
##  1st Qu.:130.0   1st Qu.:125.0   1st Qu.:19.00   f:315  
##  Median :155.0   Median :150.0   Median :20.00          
##  Mean   :158.9   Mean   :152.2   Mean   :20.22          
##  3rd Qu.:180.0   3rd Qu.:175.0   3rd Qu.:21.00          
##  Max.   :350.0   Max.   :315.0   Max.   :22.00

## 
##  I choose gender as another categorical variable to identify relationship with BMI

## 
##  In the plot, I see that Female seems to have lower BMI as compared to Males

###  NOW ON YOUR OWN....
# Question 1: scatterplot of current weight (x) against desired weight (y),
# followed by the correlation between the two columns.
output <- "[1]Make a scatterplot of weight versus desired weight. Describe the relationship between these two variables."
cat("\n", output, "\n")

## 
##  [1]Make a scatterplot of weight versus desired weight. Describe the relationship between these two variables.

plot(cdc$weight, cdc$wtdesire)

corr <- cor(x = cdc$weight, y = cdc$wtdesire)

output <- "Weight and wtdesired are positvely correlated. As the weight increases the desired weight also increases."

# Print the written answer together with the computed correlation.
cat("\n", output, "correlation is:", corr, "\n")

## 
##  Weight and wtdesired are positvely correlated. As the weight increases the desired weight also increases. correlation is: 0.8000521

# Let's consider a new variable: the difference between desired weight (wtdesire) and current weight (weight).
# Create this new variable by subtracting the two columns in the data frame and assigning them to a new object called wdiff.
# wdiff is desired minus current weight, so a negative value means the
# respondent wants to weigh less than they currently do.
wdiff <- cdc$wtdesire - cdc$weight
# What type of data is wdiff? If an observation wdiff is 0, what does this mean about the person's weight and desired weight.
# What if wdiff is positive or negative?
output <- "[2]What type of data is wdiff"
cat("\n",output,"\n")

## 
##  [2]What type of data is wdiff

# str() reports the storage type and the first few values.
str(wdiff)

##  int [1:20000] 0 -10 0 -8 -20 0 -9 -10 -20 -10 ...

output <- "If wdiff is 0, that means the person is at an ideal weight. if wdiff is positive that means person is underweight and if wdiff is negative that means person is overweight."
cat("\n",output,"\n")

## 
##  If wdiff is 0, that means the person is at an ideal weight. if wdiff is positive that means person is underweight and if wdiff is negative that means person is overweight.

# Describe the distribution of wdiff in terms of its center, shape, and spread, including any plots you use.
summary(wdiff)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -300.00  -21.00  -10.00  -14.59    0.00  500.00

# Center: reported as the mean of wdiff.
output <- "wdiff is left skewed."
wdiff_center <- mean(wdiff)
cat("\n", "Center is : ", wdiff_center)

## 
##  Center is :  -14.5891

# Shape: the written description stored in `output`, plus a histogram.
cat("\n", "Shape is :", output)

## 
##  Shape is : wdiff is left skewed.

hist(wdiff, breaks = 20, las = 1)

# Spread: the minimum and maximum, then the interquartile range.
wdiff_range <- range(wdiff)
cat("\n", "Spread is :", wdiff_range)

## 
##  Spread is : -300 500

wdiff_iqr <- IQR(wdiff)
cat("\n", "Interquartile range is :", wdiff_iqr)

## 
##  Interquartile range is : 21

output <- "What does this tell us about how people feel about their current weight?: "
output1 <- "Majority of the people feel that they are overweight by few pounds between 0 and 21 pounds"
cat("\n", output, output1, "\n")

## 
##  What does this tell us about how people feel about their current weight?:  Majority of the people feel that they are overweight by few pounds between 0 and 21 pounds

# Using numerical summaries and a side-by-side box plot, determine if men tend to view their weight differently than women.

# Numerical summary of wdiff within each gender; the prompt asks for this
# in addition to the plot.
tapply(wdiff, cdc$gender, summary)

# Side-by-side box plot of wdiff, one box per gender.
boxplot(wdiff ~ cdc$gender)

# cdc$gender is a factor, so plot() with a numeric y also draws box plots of
# wdiff grouped by gender (same picture as above, different axis labels).
plot(cdc$gender,wdiff)

# Now it's time to get creative. Find the mean and standard deviation of weight and determine
# what proportion of the weights are within one standard deviation of the mean.

# Work on the weight column of cdc.
weights <- cdc$weight

# Location and scale of the weights.
mean_weight <- mean(weights)
sd_weight <- sd(weights)

# Interval covering one standard deviation on either side of the mean.
lower_bound <- mean_weight - sd_weight
upper_bound <- mean_weight + sd_weight

# Flag each weight that falls inside the interval (bounds inclusive), then count.
is_within_sd <- (weights >= lower_bound) & (weights <= upper_bound)
within_sd <- sum(is_within_sd)

# Share of all weights that fall inside the interval.
proportion_within_sd <- within_sd / length(weights)

# Report the result.
cat("\n", "Proportion within one standard deviation:", proportion_within_sd, "\n")

## 
##  Proportion within one standard deviation: 0.7076

Lecture 2

2023-09-11

R Markdown