This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
##Exercise 1
################### EXERCISE ########################################
# Load the lab data set. The sourced script presumably defines the `cdc`
# data frame used by every statement below -- confirm against the script.
source("http://www.openintro.org/stat/data/cdc.R")
## Exercise: Let us see some other numerical calculations.
output <- "[1]Create a numerical summary for height and compute the interquartile range"
cat("\n",output,"\n")
##
## [1]Create a numerical summary for height and compute the interquartile range
summary(cdc$height)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 48.00 64.00 67.00 67.18 70.00 93.00
# The prompt also asks for the interquartile range, which summary() does not
# report directly. IQR() returns Q3 - Q1; with the quartiles printed above
# that is 70 - 64 = 6.
IQR(cdc$height)
output <- "[2]Create a numerical summary for age and compute the interquartile range"
cat("\n",output,"\n")
##
## [2]Create a numerical summary for age and compute the interquartile range
summary(cdc$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 31.00 43.00 45.07 57.00 99.00
# Interquartile range for age; with the quartiles printed above that is
# 57 - 31 = 26.
IQR(cdc$age)
## Relative frequency distributions for gender and exerany:
## each category count is divided by the total number of rows in cdc.
numRows <- nrow(cdc)

output <- "[3]Relative frequency distribution for gender:"
cat(sprintf("\n %s \n", output))
##
## [3]Relative frequency distribution for gender:
print(table(cdc$gender) / numRows)
##
## m f
## 0.47845 0.52155

output <- "[4]Relative frequency distribution for exerany:"
cat(sprintf("\n %s \n", output))
##
## [4]Relative frequency distribution for exerany:
print(table(cdc$exerany) / numRows)
##
## 0 1
## 0.2543 0.7457
# Counts per gender; the "m" column answers the question about males.
output <- "[5]How many males are in the sample?"
cat(sprintf("\n %s \n", output))
##
## [5]How many males are in the sample?
print(table(cdc$gender))
##
## m f
## 9569 10431

# Share of respondents in each general-health category; the "excellent"
# column answers the question.
output <- "[6]What proportion of the sample reports being in excellent health?"
cat(sprintf("\n %s \n", output))
##
## [6]What proportion of the sample reports being in excellent health?
print(table(cdc$genhlth) / nrow(cdc))
##
## excellent very good good fair poor
## 0.23285 0.34860 0.28375 0.10095 0.03385
# Prompt text for exercise 7 (the literal ends with an embedded newline).
output <- "[7]HOW DO WE Create a new object called under23_and_smoke that contains all observations of respondents under the age of 23 that have smoked 100 cigarettes in their lifetime. Write the command you used to create the new object as the answer to this exercise.
"
cat(sprintf("\n %s \n", output))
##
## [7]HOW DO WE Create a new object called under23_and_smoke that contains all observations of respondents under the age of 23 that have smoked 100 cigarettes in their lifetime. Write the command you used to create the new object as the answer to this exercise.
##
# Keep only the rows where age is below 23 and smoke100 equals 1.
under23_and_smoke <- subset(
  cdc,
  subset = age < 23 & smoke100 == 1
)
# Column-by-column summary of the filtered rows.
print(summary(under23_and_smoke))
## genhlth exerany hlthplan smoke100 height
## excellent:110 Min. :0.0000 Min. :0.0000 Min. :1 Min. :59.00
## very good:244 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:1 1st Qu.:65.00
## good :204 Median :1.0000 Median :1.0000 Median :1 Median :68.00
## fair : 53 Mean :0.8145 Mean :0.6952 Mean :1 Mean :67.92
## poor : 9 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1 3rd Qu.:71.00
## Max. :1.0000 Max. :1.0000 Max. :1 Max. :79.00
## weight wtdesire age gender
## Min. : 85.0 Min. : 80.0 Min. :18.00 m:305
## 1st Qu.:130.0 1st Qu.:125.0 1st Qu.:19.00 f:315
## Median :155.0 Median :150.0 Median :20.00
## Mean :158.9 Mean :152.2 Mean :20.22
## 3rd Qu.:180.0 3rd Qu.:175.0 3rd Qu.:21.00
## Max. :350.0 Max. :315.0 Max. :22.00
##
## I chose gender as another categorical variable to examine its relationship with BMI
##
## In the plot, females appear to have a lower BMI than males
### NOW ON YOUR OWN....
output <- "[1]Make a scatterplot of weight versus desired weight. Describe the relationship between these two variables."
cat(sprintf("\n %s \n", output))
##
## [1]Make a scatterplot of weight versus desired weight. Describe the relationship between these two variables.
# Scatterplot with current weight on the x axis and desired weight on the y axis.
plot(x = cdc$weight, y = cdc$wtdesire)
# Correlation between the two columns, reported next to the written answer.
corr <- cor(x = cdc$weight, y = cdc$wtdesire)
output <- "Weight and wtdesired are positvely correlated. As the weight increases the desired weight also increases."
cat("\n", output, "correlation is:", corr, "\n", sep = " ")
##
## Weight and wtdesired are positvely correlated. As the weight increases the desired weight also increases. correlation is: 0.8000521
#. Let's consider a new variable: the difference between desired weight (wtdesire) and current weight (weight).
# Create this new variable by subtracting the two columns in the data frame and assigning them to a new object called wdiff.
# wdiff = desired weight minus current weight, one value per row of cdc.
wdiff <- cdc$wtdesire - cdc$weight
#. What type of data is wdiff? If an observation wdiff is 0, what does this mean about the person's weight and desired weight.
# What if wdiff is positive or negative?
output <- "[2]What type of data is wdiff"
cat("\n",output,"\n")
##
## [2]What type of data is wdiff
# str() shows the storage type and the first few values.
str(wdiff)
## int [1:20000] 0 -10 0 -8 -20 0 -9 -10 -20 -10 ...
# Because wdiff is desired - current: 0 means the two weights are equal,
# a positive value means the person wants to weigh more, and a negative
# value means the person wants to weigh less.
output <- "If wdiff is 0, that means the person is at an ideal weight. if wdiff is positive that means person is underweight and if wdiff is negative that means person is overweight."
cat("\n",output,"\n")
##
## If wdiff is 0, that means the person is at an ideal weight. if wdiff is positive that means person is underweight and if wdiff is negative that means person is overweight.
#. Describe the distribution of wdiff in terms of its center, shape, and spread, including any plots you use.
# Five-number summary plus the mean.
print(summary(wdiff))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -300.00 -21.00 -10.00 -14.59 0.00 500.00
# Center: reported as the arithmetic mean.
wdiff_center <- mean(wdiff)
cat("\n", "Center is : ", wdiff_center, sep = " ")
##
## Center is : -14.5891
# Shape: written description, followed by a histogram.
output <- "wdiff is left skewed."
cat(sprintf("\n Shape is : %s", output))
##
## Shape is : wdiff is left skewed.
hist(wdiff, las = 1, breaks = 20)
# Spread: minimum and maximum, then the interquartile range.
wdiff_range <- range(wdiff)
cat("\n", "Spread is :", wdiff_range, sep = " ")
##
## Spread is : -300 500
wdiff_iqr <- IQR(wdiff)
cat("\n", "Interquartile range is :", wdiff_iqr, sep = " ")
##
## Interquartile range is : 21
# Written interpretation of the numbers above.
output <- "What does this tell us about how people feel about their current weight?: "
output1 <- "Majority of the people feel that they are overweight by few pounds between 0 and 21 pounds"
cat(sprintf("\n %s %s \n", output, output1))
##
## What does this tell us about how people feel about their current weight?: Majority of the people feel that they are overweight by few pounds between 0 and 21 pounds
#. Using numerical summaries and a side-by-side box plot, determine if men tend to view their weight differently than women.
# Numerical summaries of wdiff computed separately for each gender level
# (the exercise asks for these in addition to the plot).
tapply(wdiff, cdc$gender, summary)
# Side-by-side box plot of wdiff by gender. The former second call,
# plot(cdc$gender, wdiff), was dropped: with a factor on the x axis plot()
# draws this same box plot again.
boxplot(wdiff ~ cdc$gender)
#. Now it's time to get creative. Find the mean and standard deviation of weight and determine
# what proportion of the weights are within one standard deviation of the mean.
# Weights are taken from the weight column of cdc.
weights <- cdc$weight
# Location and scale of the weights
mean_weight <- mean(weights)
sd_weight <- sd(weights)
# Interval of one standard deviation on either side of the mean
bounds <- mean_weight + c(-1, 1) * sd_weight
lower_bound <- bounds[1]
upper_bound <- bounds[2]
# TRUE for every weight lying inside the interval (endpoints included)
is_within <- !(weights < lower_bound | weights > upper_bound)
within_sd <- sum(is_within)
# Share of all weights that fall inside the interval
proportion_within_sd <- within_sd / length(weights)
# Report the result
cat("\n", "Proportion within one standard deviation:", proportion_within_sd, "\n", sep = " ")
##
## Proportion within one standard deviation: 0.7076