R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

#Exercise 1 (Subsetting and Statistics)


##For this exercise, we will use the msleep dataset from the ggplot2 package. Note: for all parts, the answer should be obtained from coding with the code displayed in your submission, and not just typed as plain text.
##(a) First, load the ggplot2 package, and pull up the msleep dataset. What type of object is this dataset?
library(ggplot2)
#View(msleep)
# Part (a): report the object's type from code instead of stating it as text.
class(msleep)
# Per-column overview of the dataset (its output is shown below).
summary(msleep)
##      name              genus               vore              order          
##  Length:83          Length:83          Length:83          Length:83         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  conservation        sleep_total      sleep_rem      sleep_cycle    
##  Length:83          Min.   : 1.90   Min.   :0.100   Min.   :0.1167  
##  Class :character   1st Qu.: 7.85   1st Qu.:0.900   1st Qu.:0.1833  
##  Mode  :character   Median :10.10   Median :1.500   Median :0.3333  
##                     Mean   :10.43   Mean   :1.875   Mean   :0.4396  
##                     3rd Qu.:13.75   3rd Qu.:2.400   3rd Qu.:0.5792  
##                     Max.   :19.90   Max.   :6.600   Max.   :1.5000  
##                                     NA's   :22      NA's   :51      
##      awake          brainwt            bodywt        
##  Min.   : 4.10   Min.   :0.00014   Min.   :   0.005  
##  1st Qu.:10.25   1st Qu.:0.00290   1st Qu.:   0.174  
##  Median :13.90   Median :0.01240   Median :   1.670  
##  Mean   :13.57   Mean   :0.28158   Mean   : 166.136  
##  3rd Qu.:16.15   3rd Qu.:0.12550   3rd Qu.:  41.750  
##  Max.   :22.10   Max.   :5.71200   Max.   :6654.000  
##                  NA's   :27
   #the msleep dataset is a data frame; more precisely it is a tibble (classes "tbl_df", "tbl", "data.frame")


## (b) How many observations ( rows )and how many variables (cols) are in this dataset?
 # the msleep data set has 83 observations (rows) and 11 variables (columns)
 # row count first, then column count
  c(nrow(msleep), ncol(msleep))
## [1] 83 11
##(c) We want to find the mean hours of REM sleep of individuals in this dataset. First write a command to
##check if there is any missing data. (Hint: To avoid losing data, we only want to remove observations 
#where the variable of interest is missing). Then, calculate the mean hours of REM sleep, removing only what is necessary.
   # Count the missing REM-sleep values before dropping anything.
   sum(is.na(msleep$sleep_rem)) # checks for missing data 
## [1] 22
   # Keep only the rows where sleep_rem is present. Rows that are missing
   # other variables are retained, so no more data is lost than necessary.
   # Plain subsetting replaces ggplot2's remove_missing(), a plotting helper
   # that emitted a "Removed 22 rows" warning here.
   msleep_cleaned = msleep[!is.na(msleep$sleep_rem), ]
   sleep_rem_mean = mean(msleep_cleaned$sleep_rem) #calculates mean for REM sleep hours 1.87541
   sleep_rem_mean
## [1] 1.87541
   summ = sum(msleep_cleaned$sleep_rem) #SUM of REM sleep hours
   summ
## [1] 114.4
   # Print the raw column (NAs included) for reference.
      msleep$sleep_rem
##  [1]  NA 1.8 2.4 2.3 0.7 2.2 1.4  NA 2.9  NA 0.6 0.8 0.7 1.5 2.2 2.0 1.4 3.1 0.5
## [20] 4.9  NA 3.9 0.6 0.4 3.5 1.1  NA 3.2 1.1 0.4 0.1 1.5 0.6 1.9 0.9  NA 6.6 1.2
## [39] 1.9 3.1  NA 1.4 2.0  NA  NA 0.9  NA 0.9 0.6 1.4  NA  NA  NA 1.0 2.7  NA  NA
## [58] 1.8 0.4  NA 1.5 6.1 0.5 2.4  NA 1.4 2.1 1.1 2.4  NA 3.4 3.0 2.0 2.4  NA  NA
## [77] 1.0 2.3 2.6  NA 1.3  NA 2.4
##(d) What is the standard deviation of brain weight of individuals in this dataset?
     # Standard deviation of brain weight, taken as the square root of the
     # variance with missing values excluded; the result is 0.9764137
     sd_brainwt <- sqrt(var(msleep$brainwt, na.rm = TRUE))
     sd_brainwt
## [1] 0.9764137
#??standardize 
## (e) Which observation (provide the name) in this dataset gets the most REM sleep?
     # Row index of the largest REM-sleep value (which.max() discards NAs).
     high_rem_row  = which.max(msleep$sleep_rem)     #37 Thick-tailed opposum
     # The question asks for the animal's name, so look it up from the index.
     high_rem_name = msleep$name[high_rem_row]
     high_rem_name
     high_rem_row 
## [1] 37
## (f) What is the average bodyweight of carnivores in this dataset?
    
     # Mean body weight of carnivores only.
     # The earlier call passed the carnivore mask as mean()'s third positional
     # argument (na.rm), so no rows were filtered and the overall mean
     # (166.1363, the same as the bodywt mean in summary(msleep)) was reported.
     # which() converts the mask to row indices and drops any NA comparisons,
     # so animals with a missing vore value cannot inject NA into the mean.
     avg_bd_carni = mean(msleep$bodywt[which(msleep$vore == "carni")])
     avg_bd_carni
     # NOTE: re-knit to record the carnivore-only value; the previously
     # recorded 166.1363 was the mean over all animals.
#For this exercise, we will use the birthwt dataset from the MASS package.
  #View(birthwt)
#summary(birthwt)
library(MASS)
#??birthwt

#(a) Each observation (row) in this data set represents one mother and her newborn, recorded in a study of risk factors for low infant birth weight
#(b) dataframe 
#(c)this dataset has 189 observations and 10 variables 
#(d)
    #birthwt %>%
      # Scatter plot of infant birth weight (x) against mother's weight (y),
      # with point size scaled by birth weight.
      # A constant colour is set on the geom, not inside aes():
      # aes(color = "orange") would map a one-level variable called "orange",
      # drawing the points in the default palette colour and adding a legend.
      ggplot(data = birthwt, aes(x = bwt, y = lwt, size = bwt)) +
      geom_point(color = "orange") +
      labs(x = "Birth weight (g)", y = "Mother's weight (lb)") +
      ggtitle("Scatter Chart: Birth weight vs Mothers weight, Size = bwt, Color = orange")

 # ??scatterplot
    
    
#(e) 
   # birthwt %>%
      # Scatter plot of infant birth weight (x) against mother's age (y),
      # with point size scaled by birth weight.
      # The colour is set directly on the geom. Inside aes(), color = "pink"
      # would be treated as a one-level variable rather than a literal colour.
      ggplot(data = birthwt, aes(x = bwt, y = age, size = bwt)) +
      geom_point(color = "pink") +
      labs(x = "Birth weight (g)", y = "Mother's age (years)") +
      ggtitle("Scatter Chart: Birth weight vs Mothers Age, Size = bwt, Color = pink")

#(f) Create side-by-side boxplots for birth weight grouped by smoking status. Use non-default colors for the
#plot. (Also, be sure to give the plot a title and label the axes appropriately.) Based on the boxplot, does
#there seem to be a difference in birth weight for mothers who smoked? Briefly explain.
    # Side-by-side boxplots of birth weight grouped by smoking status.
    # smoke is coded 0/1 (0 = non-smoker, 1 = smoker per the MASS birthwt
    # documentation), so names= replaces the raw codes with readable labels.
    boxplot(bwt ~ smoke, data = birthwt,
    names = c("Non-smoker", "Smoker"),
    ylab = "Birth weight (g)",
    xlab = "Smoking status during pregnancy",
    main = "Birth Weight for Non-Smoker vs Smokers",
    pch  = 20,  # solid dots for outlying points
    cex  = 2, #increase size
    border = "darkorange" ,
    col = "dodgerblue") 

     # based on the box plot there seems to be a slight difference in birth weight for mothers who smoked during pregnancy
  # For this exercise we will use the data stored in nutrition-2018.csv. It contains the nutritional values per
#serving size for a large variety of foods as calculated by the USDA in 2018. It is a cleaned version totaling
#5956 observations and is current as of April 2018.

library(ggplot2)
library(MASS)
#reads file

# Location of the nutrition CSV on the author's machine.
path = "/Users/sweet/Downloads/nutrition-2018.csv"
nutrition = read.csv(path)
# View() opens a data viewer, so only call it in an interactive session.
# This matches the other View() calls in this file, which are commented out
# so the document can be knitted.
if (interactive()) {
  View(nutrition)
}

#(a) Create a histogram of Calories. Do not modify R’s default bin selection.
# Help search kept for reference only. Left live it starts the help server
# on every run; the file's other ?? lookups are commented out the same way.
#??histogram #helps for formatting
# Histogram of Calories using R's default bin selection (no breaks= given).
hist(nutrition$Calories,
     xlab = "Calories (kcal)",
     main = "Histogram of Calories for Various Foods",
     border = "dodgerblue",
     col  = "darkorange")

#(b) Most foods in the nutrition dataset have fewer than 400 kcal; the distribution of Calories is right-skewed, and there are two unusual features, one around 400 kcal and one past 800 kcal

#(c) Create a scatter plot of carbs (x-axis) vs calories (y-axis), with title and all axis labels. Comment on

  # scatter plot formula for carbs vs calories 
 # nutrition%>%
  # Scatter plot of Carbs (x) against Calories (y).
  # The stray `size = Calories, color = blue` arguments to ggplot() were
  # dropped: they sat outside aes() and were not aesthetics, so the point
  # colour has always come from geom_point() below.
  ggplot(data = nutrition, aes(x = Carbs, y = Calories)) +
    geom_point(color = "blue") +
    labs(x = "Carbs (g)", y = "Calories (kcal)", title = "Carbs (g) vs Calories (kcal)") +
    theme(plot.title = element_text(size = rel(.9)))  # slightly smaller title

#(d) 

# Reported Calories (y) against a macronutrient-based estimate (x):
# 4 * Protein + 4 * Carbs + 9 * Fat + 2 * Fiber.
# The x-axis label and title now describe that estimate; previously they
# said "Protein", which is not what is plotted.
plot(Calories ~ I(4 * Protein + 4 * Carbs + 9 * Fat + 2 * Fiber), data = nutrition,
     xlab = "4 * Protein + 4 * Carbs + 9 * Fat + 2 * Fiber",
     ylab = "Calories (kcal)",
     main = "Calories vs 4 * Protein + 4 * Carbs + 9 * Fat + 2 * Fiber",
     pch  = 20,
     cex  = 1,
     col  = "darkorange")

   #One reason the scatter plot might not be perfectly linear is the variety of foods, i.e. not all carbs will be high in protein, and some fruits are lower in sugar than others.
   #Rounding of the reported numbers
   #and errors in measurement could also contribute.
  #hw exercise 4
# a)
 a <- 1:20
 # Return the sum of the squared elements of x.
 sum_of_squares <- function(x) {
   squared <- x ^ 2
   sum(squared)
 }
 # Sum of squares of the integers 1 through 20.
   sum_of_squares(a)
## [1] 2870
#b)
   # Sample variance from first principles: the sum of squared deviations
   # from the mean, divided by (n - 1).
   x <- c(11200, 7900, 7900, 4900, 4700)
   n <- length(x)
   deviations <- x - mean(x)
   ss <- sum_of_squares(deviations)
   sam_var <- ss / (n - 1)
   sam_var
## [1] 7112000
   # built-in var() for comparison
   var(x)
## [1] 7112000
#c)  
   # Load the nutrition data for this part of the exercise.
   path <- "/Users/sweet/Downloads/nutrition-2018.csv"
   nutrition <- read.csv(file = path)
   #View(nutrition)
   
#  difs =sqrt(sum_of_squares(x = c(a - b)) / length(a)) 
  #difs
 #a) 

# Return the elements of x that lie more than k standard deviations from
# the mean of x.
#
# x: numeric vector. NA entries are ignored when computing the mean and
#    standard deviation and are never reported as extreme.
# k: number of standard deviations that defines "extreme" (default 2).
#
# Returns an unnamed list of two vectors: first the elements below
# mean - k * sd, then the elements above mean + k * sd.
list_extreme_values = function(x, k = 2){
  # Compute the centre and spread once; na.rm keeps a single NA from
  # turning every comparison (and so the whole result) into NA.
  center = mean(x, na.rm = TRUE)
  spread = sd(x, na.rm = TRUE)

  # which() converts each comparison to indices and drops NA comparisons,
  # replacing the redundant ifelse(cond, TRUE, FALSE) masks.
  small = x[which(x < center - k * spread)]
  large = x[which(x > center + k * spread)]

  list(small, large)
}
 

#b)


## mean(list_extreme_values(x=y, k= 1.5)[[2]])