This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
#Exercise 1 (Subsetting and Statistics)
##For this exercise, we will use the msleep dataset from the ggplot2 package. Note: for all parts, the answer should be obtained from coding with the code displayed in your submission, and not just typed as plain text.
##(a) First, load the ggplot2 package, and pull up the msleep dataset. What type of object is this dataset?
library(ggplot2)
#View(msleep)
summary(msleep)
## name genus vore order
## Length:83 Length:83 Length:83 Length:83
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## conservation sleep_total sleep_rem sleep_cycle
## Length:83 Min. : 1.90 Min. :0.100 Min. :0.1167
## Class :character 1st Qu.: 7.85 1st Qu.:0.900 1st Qu.:0.1833
## Mode :character Median :10.10 Median :1.500 Median :0.3333
## Mean :10.43 Mean :1.875 Mean :0.4396
## 3rd Qu.:13.75 3rd Qu.:2.400 3rd Qu.:0.5792
## Max. :19.90 Max. :6.600 Max. :1.5000
## NA's :22 NA's :51
## awake brainwt bodywt
## Min. : 4.10 Min. :0.00014 Min. : 0.005
## 1st Qu.:10.25 1st Qu.:0.00290 1st Qu.: 0.174
## Median :13.90 Median :0.01240 Median : 1.670
## Mean :13.57 Mean :0.28158 Mean : 166.136
## 3rd Qu.:16.15 3rd Qu.:0.12550 3rd Qu.: 41.750
## Max. :22.10 Max. :5.71200 Max. :6654.000
## NA's :27
#the MSLEEP dataset is a dataframe object
## (b) How many observations ( rows )and how many variables (cols) are in this dataset?
#the MSLEEP data set has 83 observations (rows) and 11 variables (columns)
dim(msleep)
## [1] 83 11
##(c) We want to find the mean hours of REM sleep of individuals in this dataset. First write a command to
##check if there is any missing data. (Hint: To avoid losing data, we only want to remove observations
#where the variable of interest is missing). Then, calculate the mean hours of REM sleep, removing only what is necessary.
sum(is.na(msleep$sleep_rem)) # checks for missing data
## [1] 22
# Keep only the rows where sleep_rem is present; rows are never dropped
# because of a missing value in some other column.
# Plain logical subsetting is used instead of ggplot2::remove_missing(), which
# is a plotting helper and reported the removal with a warning about
# "values outside the scale range".
msleep_cleaned <- msleep[!is.na(msleep$sleep_rem), ]
# Mean hours of REM sleep over the rows that have a value.
sleep_rem_mean <- mean(msleep_cleaned$sleep_rem)
sleep_rem_mean
## [1] 1.87541
# Total hours of REM sleep over the same rows.
summ <- sum(msleep_cleaned$sleep_rem)
summ
## [1] 114.4
msleep$sleep_rem
## [1] NA 1.8 2.4 2.3 0.7 2.2 1.4 NA 2.9 NA 0.6 0.8 0.7 1.5 2.2 2.0 1.4 3.1 0.5
## [20] 4.9 NA 3.9 0.6 0.4 3.5 1.1 NA 3.2 1.1 0.4 0.1 1.5 0.6 1.9 0.9 NA 6.6 1.2
## [39] 1.9 3.1 NA 1.4 2.0 NA NA 0.9 NA 0.9 0.6 1.4 NA NA NA 1.0 2.7 NA NA
## [58] 1.8 0.4 NA 1.5 6.1 0.5 2.4 NA 1.4 2.1 1.1 2.4 NA 3.4 3.0 2.0 2.4 NA NA
## [77] 1.0 2.3 2.6 NA 1.3 NA 2.4
##(d) What is the standard deviation of brain weight of individuals in this dataset?
# Standard deviation of brain weight; missing brainwt values are removed
# before the calculation (na.rm = TRUE).
sd_brainwt <- sd(x = msleep$brainwt, na.rm = TRUE)
sd_brainwt
## [1] 0.9764137
#??standardize
## (e) Which observation (provide the name) in this dataset gets the most REM sleep?
# Row index of the largest sleep_rem value.
high_rem_row <- which.max(msleep$sleep_rem)
high_rem_row
## [1] 37
# Look the name up from that row so the answer is produced by code instead of
# being read off the table by hand (the original note recorded
# "Thick-tailed opposum" for row 37).
msleep$name[high_rem_row]
## (f) What is the average bodyweight of carnivores in this dataset?
# Average body weight of carnivores only.
# The earlier call handed `msleep$vore == "carni"` to mean() as a third
# positional argument, where it is matched to `na.rm` rather than used as a
# filter; the recorded result 166.1363 is the overall bodywt mean that
# summary(msleep) also reports.
# which() converts the comparison to row indices and drops rows whose vore is
# NA, so those rows do not turn into NA weights.
avg_bd_carni <- mean(msleep$bodywt[which(msleep$vore == "carni")])
avg_bd_carni
# NOTE(review): re-knit the document to record the carnivore-only value here.
#For this exercise, we will use the birthwt dataset from the MASS package.
#View(birthwt)
#summary(birthwt)
library(MASS)
#??birthwt
#(a) Observations (rows) in this data set represent mothers that gave birth to infants with low birth weight
#(b) dataframe
#(c)this dataset has 189 observations and 10 variables
#(d)
#birthwt %>%
# Scatter plot of bwt (x-axis) against lwt (y-axis), with point size mapped
# to bwt.
# The colour is a fixed setting, so it belongs in geom_point() rather than in
# aes(): inside aes() the string "orange" is treated as a data value, which
# draws the points in the default palette colour and adds a legend entry
# labelled "orange".
ggplot(data = birthwt, aes(x = bwt, y = lwt, size = bwt)) +
  geom_point(color = "orange") +
  ggtitle("Scatter Chart: Birth weight vs Mothers weight, Size = bwt, Color = orange")
# ??scatterplot
#(e)
# birthwt %>%
# Scatter plot of bwt (x-axis) against age (y-axis), with point size mapped
# to bwt.
# The colour is a fixed setting, so it belongs in geom_point() rather than in
# aes(): inside aes() the string "pink" is treated as a data value, which
# draws the points in the default palette colour and adds a legend entry
# labelled "pink".
ggplot(data = birthwt, aes(x = bwt, y = age, size = bwt)) +
  geom_point(color = "pink") +
  ggtitle("Scatter Chart: Birth weight vs Mothers Age, Size = bwt, Color = pink")
#(f) Create side-by-side boxplots for birth weight grouped by smoking status. Use non-default colors for the
#plot. (Also, be sure to give the plot a title and label the axes appropriately.) Based on the boxplot, does
#there seem to be a difference in birth weight for mothers who smoked? Briefly explain.
# Side-by-side boxplots of birth weight (bwt), one box per value of smoke.
boxplot(
  bwt ~ smoke,
  data   = birthwt,
  main   = "Birth Weight for Non-Smoker vs Smokers",
  xlab   = "Smokers",
  ylab   = "Birthweight",
  col    = "dodgerblue",
  border = "darkorange",
  pch    = 20,
  cex    = 2 # increase size
)
# Based on the box plot, there seems to be a slight difference in birth weight for mothers who smoked during pregnancy
# For this exercise we will use the data stored in nutrition-2018.csv. It contains the nutritional values per
#serving size for a large variety of foods as calculated by the USDA in 2018. It is a cleaned version totaling
#5956 observations and is current as of April 2018.
library(ggplot2)
library(MASS)
#reads file
# Location of the nutrition data.
# NOTE(review): absolute, user-specific path; the file must exist there for
# the script to run.
path <- "/Users/sweet/Downloads/nutrition-2018.csv"
nutrition <- read.csv(path)
# Open the data viewer only in an interactive session, so that knitting or
# running the script non-interactively does not try to open a viewer window.
if (interactive()) {
  View(nutrition)
}
#(a) Create a histogram of Calories. Do not modify R’s default bin selection.
# ??histogram  # help search used while formatting the plot; commented out
# like the other ?? lookups in this file so running the script does not start
# the help server.
# Histogram of Calories using R's default bin selection.
hist(nutrition$Calories,
     xlab = "Calories (kcal)",
     main = "Histogram of Calories for Various Foods",
     border = "dodgerblue",
     col = "darkorange")
#(b) Most foods in the nutrition dataset have fewer than 400 kcal; the distribution of Calories is right-skewed, and there are two odd points, one at 400 kcal and one past 800 kcal
#(c) Create a scatter plot of carbs (x-axis) vs calories (y-axis), with title and all axis labels. Comment on
# scatter plot formula for carbs vs calories
# nutrition%>%
# Scatter plot of Carbs (x-axis) against Calories (y-axis).
# The stray `size = Calories, color = blue` arguments that followed aes() are
# removed: they sat outside any aesthetic mapping and used bare names that
# are not defined anywhere in this script. The point colour is set in
# geom_point(). The title typo "Carlories" is corrected to "Calories".
ggplot(data = nutrition, aes(x = Carbs, y = Calories)) +
  geom_point(color = "blue") +
  labs(
    x = "Carbs (g)",
    y = "Calories ",
    title = "Carbs (g) Vs Calories (kcal) "
  ) +
  theme(plot.title = element_text(size = rel(.9)))
#(d)
# Scatter plot of Calories (y-axis) against the weighted sum
# 4 * Protein + 4 * Carbs + 9 * Fat + 2 * Fiber (x-axis).
# The x-axis label and title now name the quantity actually plotted; they
# previously said "Protein", which is only one term of the sum.
plot(Calories ~ I(4 * Protein + 4 * Carbs + 9 * Fat + 2 * Fiber),
     data = nutrition,
     xlab = "4 * Protein + 4 * Carbs + 9 * Fat + 2 * Fiber",
     ylab = "Calories (kcal)",
     main = "Calories vs 4 * Protein + 4 * Carbs + 9 * Fat + 2 * Fiber",
     pch = 20,
     cex = 1,
     col = "darkorange")
#A reason the scatter plot might not be perfectly linear is the complexity of different types of food, i.e. not all carbs will be high in protein, and some fruits are lower in sugar than others.
#It could also be caused by rounding of some numbers or
#by errors in measurement.
#hw exercise 4
# a)
a = 1:20
# Return the sum of the squared elements of the numeric vector `x`.
sum_of_squares <- function(x) {
  squared <- x^2
  sum(squared)
}
sum_of_squares(x = a)
## [1] 2870
#b)
# Sample variance computed by hand: the sum of squared deviations from the
# mean, divided by (n - 1).
x <- c(11200, 7900, 7900, 4900, 4700)
n <- length(x)
ss <- sum_of_squares(x - mean(x))
sam_var <- ss / (n - 1)
sam_var
## [1] 7112000
#actual var function
var(x)
## [1] 7112000
#c)
# Read the nutrition CSV into `nutrition`, replacing any copy already loaded.
# NOTE(review): absolute, user-specific path; the file must exist there.
path <- "/Users/sweet/Downloads/nutrition-2018.csv"
nutrition <- read.csv(file = path)
#View(nutrition)
# difs =sqrt(sum_of_squares(x = c(a - b)) / length(a))
#difs
#a)
# Pick out the values of `x` that lie more than `k` standard deviations away
# from the mean.
#
# x: numeric vector. NA entries are ignored when computing the mean and the
#    standard deviation and are never reported as extreme.
# k: number of standard deviations that defines the cut-offs (default 2).
#
# Returns a list of two numeric vectors: first the values below
# mean - k * sd, then the values above mean + k * sd. The elements are also
# named `small` and `large`; positional access ([[1]], [[2]]) still works.
list_extreme_values <- function(x, k = 2) {
  # Compute the centre and spread once instead of once per comparison.
  centre <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)
  # which() turns each comparison into indices and drops NA results, so a
  # missing value (or an undefined sd) cannot put NA into the output.
  small <- x[which(x < centre - k * spread)]
  large <- x[which(x > centre + k * spread)]
  list(small = small, large = large)
}
#b)
## mean(list_extreme_values(x=y, k= 1.5)[[2]])