#Foundations_of_Data_Analysis_UT_Austin-Part

#Part 1-Week 1 Introduction to Data
#(weeks 1-3)
#dataset
#from https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/2246f2ff1542f43a14f712c9b18d5b41/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/BikeData.csv

#install library packages
library(ggplot2)
library(data.table)
library(dplyr)

## -------------------------------------------------------------------------

## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!

## -------------------------------------------------------------------------

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(sqldf)

## Loading required package: gsubfn

## Loading required package: proto

## Loading required package: RSQLite

library(plyr)

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

#import dataset from url
bike<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/2246f2ff1542f43a14f712c9b18d5b41/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/BikeData.csv")

#1a. What is the age of the 7th rider in the dataset?
bike[7,]

##    user_id age gender student employed               cyc_freq distance
## 1:       7  45      M       0        1 Several times per week    13.03
##    time speed
## 1:   51 15.33

#1b. How many of the first 10 riders in the dataset ride daily?

first10rows<-bike[1:10,]
sqldf('select count(*) from first10rows where cyc_freq = "Daily" ')

## Loading required package: tcltk

## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.

##   count(*)
## 1        3

#1c. What is the speed of the first female who cycles less than one time per month (in miles/hour)?
#SQL string literals take single quotes; double quotes denote identifiers in
#standard SQL and are only accepted as strings through SQLite's lenient fallback.
females<-sqldf("select user_id, gender, cyc_freq, speed from bike where gender = 'F' ")

#or more succinctly using base R
infrequentFemaleRiders<-bike[which(bike$gender == "F" &
                            bike$cyc_freq == "Less than once a month"),]

#report the answer: speed of the first matching rider (rows keep dataset order)
infrequentFemaleRiders$speed[1]

## [1] 8.1

#Look at the first line of code:
table(bike$student)

## 
##   0   1 
## 107  14

#1a. What is the name of the dataframe?

#This line creates a new dataframe called "student":
  student <- bike[bike$student==1,]
#2. What is the role of the comma?
  
#3. What does "student" refer to in this line of code?:
    table(student$cyc_freq)

## 
##                  Daily Several times per week 
##                      8                      6

#This line of code creates a vector called "distance:"
    distance <-student$distance
#4. What does this vector consist of?
#the distance each student rides  
    
#Suppose we have run the following code to try to create a list of the times.
#Read in the dataset and name it bike
    #bike<-BikeData
#Create a vector of the times
    rider_times<-bike$times
#Check the contents of our rider_times vector
    rider_times

## NULL

#5. What error has caused the vector of rider_times to be empty? (Examine the data set in R for help.)
#incorrect variable name
  
#Week 1: Primary Research Question
#How many of the cyclists were students,
#how often did they ride, and what was the
#average distance they rode?
    
#1. How many students are in the dataset? (Hint: Look at the output for table(bike$student).)
students<-sqldf('select * from bike where student = "1" ')
dim(students)

## [1] 14  9

#2. How many variables are in the new data frame "student"? (Hint: Look in your Workspace for the new dataframe.)
dim(students)

## [1] 14  9

#3. We want to know how often the students ride. What is the most frequently observed response?
rideFrequency<-sqldf('select cyc_freq, count (*) as total_students from students group by cyc_freq ')

#4. How is the vector "distance" described in the workspace?
str(distance)

##  num [1:14] 3.25 10.94 9.34 1.25 9.29 ...

#5. How far do students ride on average? (Round to the nearest 0.01)
round(mean(distance),2)

## [1] 6.26

#R Basics Quiz 3
#1. What type of dataset file extension is most easily imported in R?
#csv

#2. In R terminology, what is the name for a matrix with cases in rows and variables in columns?
#data frame

#3. What will the following line of code do when we run it?
BikeData<-bike
table(BikeData$employed)

## 
##   0   1 
##  12 109

#tells us who is employed and unemployed

#4. If we index BikeData with the following line of code, what value will result?
BikeData[8,7]

##    distance
## 1:     6.21

#5. If we modify the above code as follows and run the code, what will result?
BikeData[,7]

##      distance
##   1:     3.25
##   2:     1.11
##   3:     5.59
##   4:     3.24
##   5:     7.81
##  ---         
## 117:     8.14
## 118:     3.00
## 119:     7.84
## 120:     5.88
## 121:     9.01

#a vector of distance values

#6. You would like to create a new data frame from BikeData which contains only employed cyclists. What should your code look like?
employed<-BikeData[BikeData$employed=='1',]
head(employed)

##    user_id age gender student employed               cyc_freq distance
## 1:       1  28      M       1        1                  Daily     3.25
## 2:       2  35      M       0        1                  Daily     1.11
## 3:       3  28      M       0        1                  Daily     5.59
## 4:       4  44      F       0        1 Less than once a month     3.24
## 5:       5  42      M       0        1 Several times per week     7.81
## 6:       6  36      M       0        1 Several times per week     3.00
##    time speed
## 1:   15 13.00
## 2:    5 13.32
## 3:   23 14.58
## 4:   24  8.10
## 5:   26 18.02
## 6:   20  9.00

#7. You would like to create a vector of distances for employed cyclists. What will your code look like?
employed_distance<-BikeData$distance[BikeData$employed=='1']
str(employed_distance)

##  num [1:109] 3.25 1.11 5.59 3.24 7.81 ...

#Week 1 LAB

#Analyze the data
#1. How many daily riders are in the dataset?
dailyRiders<-sqldf('select cyc_freq, count (*) as total_riders from bike group by cyc_freq ')
dailyRiders

##                  cyc_freq total_riders
## 1                   Daily           47
## 2  Less than once a month            2
## 3 Several times per month           14
## 4  Several times per week           58

#2. How many of the daily riders are male?
dailyRidersMale<-sqldf('select cyc_freq, count (*) as total_riders from bike 
                   where gender = "M" group by cyc_freq ')
dailyRidersMale

##                  cyc_freq total_riders
## 1                   Daily           38
## 2 Several times per month            9
## 3  Several times per week           43

#3. What is the average age of daily riders? (Round to 1 decimal place.)

avgAgedailyRiders<-sqldf('select cyc_freq, round(avg(age),1) as avg_age from bike 
                    where cyc_freq= "Daily" group by cyc_freq ')
avgAgedailyRiders

##   cyc_freq avg_age
## 1    Daily    33.7

#4. What is the average age of the female daily riders? (Round to 1 decimal place)
avgAgedailyFemaleRiders<-sqldf('select cyc_freq, round(avg(age),1) as avg_age from bike 
                    where cyc_freq= "Daily" and gender="F" group by cyc_freq ')
avgAgedailyFemaleRiders

##   cyc_freq avg_age
## 1    Daily    32.6

#5. What is the average age of the male daily riders? (Round to 1 decimal place.)
avgAgedailyMaleRiders<-sqldf('select cyc_freq, round(avg(age),1) as avg_age from bike 
                    where cyc_freq= "Daily" and gender="M" group by cyc_freq ')
avgAgedailyMaleRiders

##   cyc_freq avg_age
## 1    Daily    33.9

#6. How many daily male riders are age 30 or older?
over30maledaily<-sqldf('select cyc_freq, count(*) as total_riders from bike 
                    where age >= 30 and gender = "M" group by cyc_freq ')
over30maledaily

##                  cyc_freq total_riders
## 1                   Daily           25
## 2 Several times per month            9
## 3  Several times per week           33

#Week 2: Univariate Descriptive Stats

#Graphing a single variable

#sample used for the univariate summary problems below
x <- c(10, 2, 6, 12, 14, 15, 15, 24, 15, 25, 3, 12)

#problems
#What is the mean of the sample? (Round to 2 decimal places.)
round(mean(x), digits = 2)

## [1] 12.75

#What is the standard deviation of the sample? (Round to 2 decimal places.)
round(sd(x), digits = 2)

## [1] 7.12

#min =
min(x)

## [1] 2

#iqr
IQR(x)

## [1] 6

#quartiles (0%, 25%, 50%, 75%, 100%)
quantile(x)

##   0%  25%  50%  75% 100% 
##    2    9   13   15   25

#The mean number of overtime hours was (Round to 1 decimal place.)
round(mean(x), digits = 1)

## [1] 12.8

#The median number of overtime hours was
median(x)

## [1] 13

#The range was
range(x)

## [1]  2 25

#largest value minus smallest value
diff(range(x))

## [1] 23

#What is the shape of this distribution of scores?
hist(x)

#kernel density plot
d<-density(x)
plot(d)
plot(density(x)) #run code if data is a numeric vector

#compute z-score
#zscore(x, na.rm = getOption("na.rm", FALSE))

#COMPUTE UPPER-TAIL PROBABILITY (proportion above x); the z-score itself is (x - mean)/sd
#1-pnorm(x, mean_of_dataset, standard_deviation)

#pnorm(q, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)

#A distribution has σ = 4.  Find the z-score for a score that is:
#4 points above the mean: z = ____
4/4

## [1] 1

#12 points above the mean: z = ____
12/4

## [1] 3

#2 points below the mean: z = ____
-2/4

## [1] -0.5

#8 points below the mean: z = ____
-8/4

## [1] -2

#With a height of 75 in., Lyndon Johnson was the tallest president in the 20th century.  Presidents of the past century have heights with a mean of 71.5 in. and a standard deviation of 2.1 in.  
#With a height of 85 in., Shaquille O’Neal was the tallest player on the Miami Heat basketball team.  Basketball players for the Miami Heat during 2004 to 2008 had heights with a mean of 80.0 in. and a standard deviation of 3.3 in.

#What is the zscore for Lyndon Johnson
LBJzscore<-round((75-71.5)/2.1,2)
LBJzscore

## [1] 1.67

#Lyndon was taller than average
#1-pnorm gives you the proportion who were taller than Lyndon
round(1-pnorm(75, mean=71.5, sd=2.1),2)

## [1] 0.05

#What is the z‐score for Shaquille O’Neal? (Round to 2 decimal places.)
Shaqzscore<-round((85-80)/3.3,2)
Shaqzscore

## [1] 1.52

##1-pnorm gives you the proportion who were taller than Shaq
round(1-pnorm(85, mean=80, sd=3.3),2)

## [1] 0.06

#Who is relatively taller among their respective groups: Lyndon Johnson or Shaquille O’Neal?
#Answer: Lyndon Johnson-only 5% of presidents were taller than LBJ
#6% of other basketball members were taller than Shaq

#Which of the following is NOT a characteristic of a Normal Distribution?
#Half the data values are positive, and half the data values are negative.

#Week 2--Primary Research Question
#How many days do animals spend in the shelter before they are adopted?

#import dataset from url
animaldata<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/78cc1365ee7f9798d6dfd02cb35aab74/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/AnimalData.csv")


#1a. How many variables are in this dataset?
#inspect dataset
glimpse(animaldata)

## Observations: 473
## Variables: 24
## $ Impound.No          <chr> "K12-000031", "K12-000037", "K12-000108", ...
## $ Intake.Date         <chr> "1/1/12", "1/1/12", "1/1/12", "1/1/12", "1...
## $ Intake.Type         <chr> "Stray", "Stray", "Stray", "Stray", "Stray...
## $ Animal.Type         <chr> "Dog", "Dog", "Dog", "Dog", "Dog", "Dog", ...
## $ Neutered.Status     <chr> "Spayed", "Intact", "Intact", "Neutered", ...
## $ Sex                 <chr> "Female", "Female", "Male", "Male", "Male"...
## $ Age.Intake          <int> 10, 3, 2, 0, 3, 5, 2, 1, 1, 6, 1, 1, 8, 15...
## $ Condition           <chr> "Injured or Sick", "Normal", "Normal", "No...
## $ Breed               <chr> "Chihuahua Sh Mix", "Rat Terrier Mix", "Pi...
## $ Aggressive          <chr> "N", "N", "N", "N", "N", "N", "N", "N", "N...
## $ Independent         <chr> "N", "N", "N", "Y", "Y", "N", "N", "N", "N...
## $ Intelligent         <chr> "Y", "Y", "N", "Y", "Y", "N", "Y", "Y", "Y...
## $ Loyal               <chr> "N", "N", "Y", "Y", "Y", "Y", "Y", "N", "N...
## $ Social              <chr> "N", "Y", "N", "Y", "Y", "N", "Y", "N", "N...
## $ Good.with.Kids      <chr> "N", "Y", "Y", "Y", "Y", "Y", "Y", "N", "N...
## $ Max.Life.Expectancy <int> 18, 14, 14, 12, 12, 15, 13, 18, 18, 18, 15...
## $ Max.Weight          <int> 6, 25, 90, 79, 79, 7, 80, 6, 6, 6, 20, 35,...
## $ Dog.Group           <chr> "Toy", "Terrier", "Terrier", "Sporting", "...
## $ Color               <chr> "Tan & White", "White & Brown", "Blue & Wh...
## $ Weight              <dbl> 3.30, 7.50, 74.00, 22.00, 54.00, 4.80, 47....
## $ Lab.Test            <chr> "Heartworm Negative", "No Lab Test", "Hear...
## $ Outcome.Date        <chr> " 1/7/12", " 1/3/12", " 1/13/12", " 1/8/12...
## $ Outcome.Type        <chr> "Adoption", "Return to Owner", "Humane Eut...
## $ Days.Shelter        <int> 6, 2, 12, 7, 94, 9, 25, 13, 12, 14, 11, 12...

#1b. How many of the first 10 animals in the dataset were adopted?
first10animals<-animaldata[1:10,]

count(first10animals, "Outcome.Type")

##        Outcome.Type freq
## 1          Adoption    4
## 2 Humane Euthanasia    2
## 3   Return to Owner    3
## 4          Transfer    1

#1c. Was the first owner-surrendered animal in the dataset neutered?
count(animaldata, "Intake.Type")

##          Intake.Type freq
## 1 Euthanasia Request    1
## 2    Owner Surrender  129
## 3      Public Assist   23
## 4              Stray  320

#subset data
#`==` is required: with a single `=`, subset() receives Intake.Type as an
#unused named argument, applies no filter, and returns every row.
owner<-subset(animaldata, Intake.Type=="Owner Surrender")

#neutered status of the first owner-surrendered animal (answers 1c)
owner$Neutered.Status[1]

#2 Variables of interest: days.shelter & outcome.type

#3. We will use descriptive statistics to answer this question of interest. Why?
#We want to describe the distribution of a quantitative variable

#4. We should generate a histogram of the distribution before we calculate descriptive measures of center and spread. Why?
#We need to check the shape of the distribution


#Breakdown Your Analysis

#Let's break this analysis into its required steps:

#1. Determine which animals in the dataset were adopted.
#2. Generate a histogram for the length of time these adopted animals were in the shelter.
#3. Select the appropriate measures of center and spread to describe the distribution.
#4. Identify which animal was an outlier on this particular variable.

#find the number of animals that were adopted
count(animaldata, "Outcome.Type")

##        Outcome.Type freq
## 1          Adoption  204
## 2              Died    3
## 3 Humane Euthanasia   39
## 4   Return to Owner   73
## 5          Transfer  154

#pull out adopted animals
adopted <- animaldata[animaldata$Outcome.Type=="Adoption",]

#Pull out just the days in shelter for the adopted animals
daystoadopt <- adopted$Days.Shelter

#Visualize and describe this variable
hist(daystoadopt)
fivenum(daystoadopt)

## [1]   2   8  13  38 211

mean(daystoadopt)

## [1] 29.26471

sd(daystoadopt)

## [1] 35.71547

#row number (within animaldata) of the adopted animal with the longest stay;
#the Outcome.Type condition keeps a non-adopted animal that happens to share
#the same Days.Shelter value from being reported as well
which(animaldata$Outcome.Type=="Adoption" & animaldata$Days.Shelter==max(daystoadopt))

## [1] 425

#1. What will the line of the code do for us?
table(animaldata$Outcome.Type)

## 
##          Adoption              Died Humane Euthanasia   Return to Owner 
##               204                 3                39                73 
##          Transfer 
##               154

#It will generate a frequency table to show us how many animals experienced each type of outcome. correct

#2. We are creating a new object called daystoadopt. What does this object contain?
daystoadopt

##   [1]   6   7  94  14  11  12   9  92  28  28  16   9  15   2  22  11  13
##  [18]   5   8   8  12   8  16  28  30  13  16  72  27   6   8  27  42  14
##  [35]  17  61   6   8  66  66  53   9  36  17  14  10  13   9   5  73  87
##  [52]   8   7   9  14   7  83  49  16  46   4  97   2  12   3  69   7  62
##  [69]   6 152   8   8  22  10   7  48   8 130   7  55   8  53   9   4   7
##  [86]  11  34  67  50  51  17  59  26 201  12  18   9  10   7   9   8   9
## [103]   8  22   7  50   7  41  18  10  51  17   7   8   7   8  69  78  49
## [120]  10  33  29   7   6 123  43 159  13  10  22  76   7  13  34  10  13
## [137] 110  78  11   8  13   8   2 109   6   6   6   7  21  10  36   8   7
## [154]  16  11   8  92 128  58   7   5  12   5  22  16  18   9   6  33 120
## [171]   8   5  13  94  64  66  10  12  49  15  14  33   6  14   6  34 211
## [188]   6  91  10  15  52   8   7  40  11  12  27   7   7   8  11   8   5

#The number of days that animals spent in the shelter if their outcome type was "Adoption."

#3. Which line in the R code produces a visual of the distribution of daystoadopt?
hist(daystoadopt)

#4. The following line of R code will produce a row number:
  which(animaldata$Days.Shelter==max(daystoadopt))

## [1] 425

#What will this row number tell us?
animaldata[425,]

##    Impound.No Intake.Date Intake.Type Animal.Type Neutered.Status  Sex
## 1: K12-020743    11/18/12       Stray         Dog        Neutered Male
##    Age.Intake       Condition                           Breed Aggressive
## 1:          2 Injured or Sick Aust Cattle Dog & Labrador Retr          Y
##    Independent Intelligent Loyal Social Good.with.Kids Max.Life.Expectancy
## 1:           N           Y     Y      N              Y                  16
##    Max.Weight Dog.Group       Color Weight           Lab.Test Outcome.Date
## 1:         35   Herding White & Tan  48.25 Heartworm Negative      6/17/13
##    Outcome.Type Days.Shelter
## 1:     Adoption          211

#It will tell us the row that contains the animal that took the longest to be adopted.

#5. Suppose we have run the following code to subset the dataset for only male animals. What is the cause of the error below? (Examine the data set in R for help.)
#animaldata<-AnimalData 
males<-animaldata[animaldata$Sex == 'Male'] 
#Error in [.data.frame`(animaldata, animaldata$Sex == "Male") :
         #   undefined columns selected
#Answer: We are missing a comma inside of the brackets


#Conduct the Analysis in R
#1. How would you describe the shape of the distribution of daystoadopt?
hist(daystoadopt)
#positively skewed

#2. Which measures of center and spread should you report for this data?
#median and IQR

#Enter numerical values for the following:

#3a. Center=
median(daystoadopt)

## [1] 13

#3b. Spread=
#don't use IQR or summary, use fivenum instead
#subtract q1 from q3 (38-8=30)
#Answer: 30
fivenum(daystoadopt)

## [1]   2   8  13  38 211

#It looks like one adopted animal spent much more time in the shelter than the others.

#4a. How many days was this animal in the shelter?
max(daystoadopt)

## [1] 211

#4b. What was the z-score for this particular animal? Round to the nearest TWO decimal places.
mean(daystoadopt)

## [1] 29.26471

sd(daystoadopt)

## [1] 35.71547

OutlierAnimalzscore<-round((max(daystoadopt)-mean(daystoadopt))/sd(daystoadopt),2)
OutlierAnimalzscore

## [1] 5.09

#or
round((211-29.26)/35.71,2)

## [1] 5.09

#5. Why should we NOT report a z-score for this animal, even though we can calculate one?
#The distribution is skewed.

#Conclusion
#The middle 50% of the distribution were adopted between 8 and 38 days.
longest<-animaldata[animaldata$Days.Shelter==211,]
longest

##    Impound.No Intake.Date Intake.Type Animal.Type Neutered.Status  Sex
## 1: K12-020743    11/18/12       Stray         Dog        Neutered Male
##    Age.Intake       Condition                           Breed Aggressive
## 1:          2 Injured or Sick Aust Cattle Dog & Labrador Retr          Y
##    Independent Intelligent Loyal Social Good.with.Kids Max.Life.Expectancy
## 1:           N           Y     Y      N              Y                  16
##    Max.Weight Dog.Group       Color Weight           Lab.Test Outcome.Date
## 1:         35   Herding White & Tan  48.25 Heartworm Negative      6/17/13
##    Outcome.Type Days.Shelter
## 1:     Adoption          211

#LAB-Reflect on the Question

#1a. In this lab you will use descriptive statistics to answer a question of interest. Let’s start by remembering why we calculate descriptive statistics.
#Descriptive statistics can tell us:
#what the distribution of a variable looks like

#1b. Which of the following are examples of descriptive statistics?
#The mean and standard deviation of a distribution.

#2. One of the following questions will be answered in this lab using descriptive statistics. Select the question that can be answered with descriptive statistics.
#How much do adult cats and dogs at the shelter weigh?

#import dataset from url
animaldata<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/78cc1365ee7f9798d6dfd02cb35aab74/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/AnimalData.csv")


#Primary Research Question
#Compare the weight of adult cats and dogs at the shelter.  How typical would it be to find a 13-pound cat?  What about a 13-pound dog?

#1a. How many adult dogs are in the shelter?
dogs<-animaldata[ which(animaldata$Animal.Type=='Dog' & animaldata$Age.Intake >= 1),]
count(dogs, "Animal.Type")

##   Animal.Type freq
## 1         Dog  226

#226

#1b. How many adult cats are in the shelter?
##subsetting data
cats<-animaldata[ which(animaldata$Animal.Type=='Cat' & animaldata$Age.Intake >= 1),]
count(cats, "Animal.Type")

##   Animal.Type freq
## 1         Cat   56

#56

#2a. What is the shape of the distribution of weight for adult dogs?
dogweight<-dogs$Weight
hist(dogweight)

#positively skewed

#2b. What is the shape of the distribution of weight for adult cats?
catweight<-cats$Weight
hist(catweight)

#approx normal

#3a. Which measure of center should be used to describe the average weight of the adult cats?
#mean

#3b. Average adult cat weight in pounds (rounded to one decimal place)=
round(mean(catweight),1)

## [1] 8.6

#4. What is the standard deviation for the weight of the adult cats? Round to two decimal places.
round(sd(catweight),2)

## [1] 1.91

#5. What is the z-score of a 13 pound adult cat? Round to one decimal point.
round((13-8.6)/1.91,2)

## [1] 2.3

catZscore<-round((13-mean(catweight))/sd(catweight),1)
catZscore

## [1] 2.3

#6. Which of these best describes the location of a 13 pound adult cat in the shelter distribution?
boxplot(catweight)

hist(catweight)

#More than 2 standard deviations above the mean.

#7. What proportion of adult cats weigh more than 13 pounds, according to your data? Use the following code to answer this question: 1-pnorm(zcat). Replace "zcat" with your z-score for the cat. Round to three decimal places.
#1-pnorm gives the upper-tail proportion (share above the cutoff) under a
#normal model; note that `z` therefore holds a proportion, not a z-score
z<-round(1-pnorm(13, mean=8.6, sd=1.91),3)
#display the result so the answer appears in the output
z

## [1] 0.011

#Looking now at the descriptive statistics for the weight of adult dogs in the shelter:

#8a. What quartile would contain a 13-pound adult dog?
fivenum(dogweight)

## [1]   3.30  13.50  35.25  54.00 131.00

#8b. What percentage of adult dogs in the shelter weigh more than 13 pounds?
#normal-model estimate; the constants are presumably mean(dogweight) and
#sd(dogweight) -- TODO confirm. Dog weights were judged positively skewed
#from the histogram, so a normal tail is not a reliable estimate here.
dogPercent<-round(1-pnorm(13, mean=35.67035, sd=23.46612),2)
dogPercent

## [1] 0.83

hist(dogweight)

#empirical share of adult dogs heavier than 13 lb, read directly from the
#data; 13 sits just below the lower hinge (13.5) reported by fivenum()
dogPercentObserved<-round(mean(dogweight > 13),2)
dogPercentObserved

#answer: approx 75%

#About 95% of adult cats at the shelter weigh between 4.8 & 12.4lbs 
#http://math.tutorvista.com/statistics/empirical-rule.html
adultCats<-animaldata[ which(animaldata$Animal.Type=='Cat' & animaldata$Age.Intake >= 1),]
adultCatsWeights<-adultCats$Weight

#this is the formula (above and then below the mean)
mean(adultCatsWeights)+2*sd(adultCatsWeights)

## [1] 12.4266

mean(adultCatsWeights)+-2*sd(adultCatsWeights)

## [1] 4.780538

#Distances from mean Percentages 
#μ ± σ ≈ 68% 
#μ ± 2σ ≈ 95% 
#μ ± 3σ ≈ 99.7% 


#Week 2-Problem Set 1
#1a. What was the most common way that dogs arrived in the shelter? (as defined by the “Intake.Type” variable)
dogs<-animaldata[ which(animaldata$Animal.Type=='Dog'),]
count(dogs$Intake.Type)

##                    x freq
## 1 Euthanasia Request    1
## 2    Owner Surrender   81
## 3      Public Assist   20
## 4              Stray  189

#stray

#1b. What proportion of dogs were brought to the shelter as an owner surrender? (Round to 3 decimal places.)
intakeTable<-table(dogs$Intake.Type)
round(prop.table(intakeTable),3)

## 
## Euthanasia Request    Owner Surrender      Public Assist 
##              0.003              0.278              0.069 
##              Stray 
##              0.649

#.278

#1c. Of the dogs that were brought to the shelter as an owner surrender, how many were returned to their owner?
ownerSurrender<-dogs[ which(dogs$Intake.Type=="Owner Surrender"),]
count(ownerSurrender$Outcome.Type)

##                   x freq
## 1          Adoption   33
## 2 Humane Euthanasia   13
## 3   Return to Owner    2
## 4          Transfer   33

#2

#1d. What was the mean number of days that the dogs referenced in Question 1c spent at the shelter before being returned to their owner? (Round to 1 decimal place.)
#mean stay across ALL owner-surrendered dogs, kept for reference
round(mean(ownerSurrender$Days.Shelter),1)

## [1] 13.4

#the question asks only about the dogs that were returned to their owner,
#so restrict to that outcome before averaging
returnedToOwner<-ownerSurrender[ which(ownerSurrender$Outcome.Type=="Return to Owner"),]
round(mean(returnedToOwner$Days.Shelter),1)

hist(ownerSurrender$Days.Shelter)

#1e. What would be the correct graph type to show the distribution of dog intake types?
#bar chart of intake type counts for dogs only (the question is about dogs,
#not every animal); ggplot() + geom_bar() replaces the deprecated qplot()
ggplot(dogs, aes(x = Intake.Type)) + geom_bar()

#bar graph

#Question 2
#A professor asked her students to report how much time they spent completing a take-home exam.  The minimum time reported was half an hour, and the maximum was six and a half hours.  She compiled the data into a relative frequency table, shown below.  Note that one value is missing.

#2a. A total of 6 students reported that they worked on the take-home exam for 5 hours
#or longer. What is the total number of students that reported their time?
6/.15

## [1] 40

#40 students/solve equation is/of * 100

#2b. How many students spent fewer than 3 hours completing the take-home exam?
100-20-20-15-15-10-5

## [1] 15

20+20+15

## [1] 55

#55% of the students spent less than 3 hours
.55*40

## [1] 22

#Answer: 22

#2c. If the professor wanted to report the center of this distribution, which measure of center should she use?
#median

#2d. Which bin includes the value of Q3?
#at least 4 hours but less than 5 hours


#Question 3

#Here is a list of ten data values that are sorted from smallest to largest, but five of the data values are missing.  The missing values are represented by A, B, C, D, and E.

#2     A     3     B     6     C     9     D     15     E

#Base R's mode() reports an object's storage mode ("numeric"), not the most
#frequent value, so the statistical mode needs its own helper.
#v: atomic vector. Returns the value(s) that occur most often (all ties),
#or an empty vector when v is empty.
stat_mode<-function(v) {
  if (length(v) == 0) {
    return(v)
  }
  distinct<-unique(v)
  tally<-tabulate(match(v, distinct))
  distinct[tally == max(tally)]
}

#IQR from Tukey's hinges (upper hinge - lower hinge) as returned by fivenum();
#this reproduces the target IQR of 10 below, whereas IQR() with its default
#quantile interpolation returns 8.5 for these values.
hinge_iqr<-function(v) {
  hinges<-fivenum(v)
  hinges[4] - hinges[2]
}

dataValues<-c(2,2,3,5,6,7, 9,13, 15,18)
mean(dataValues)

## [1] 8

median(dataValues)

## [1] 6.5

stat_mode(dataValues)

## [1] 2

#range as a single number (max - min); range() alone returns both endpoints
diff(range(dataValues))

## [1] 16

hinge_iqr(dataValues)

## [1] 10

#Using the statistics below, find the missing values for the data set.  

#Mean = 8     Median = 6.5     Mode = 2     Range = 16     IQR = 10

#3a
#A=2
#B=5
#c=7
#d=13
#e=18

#3b. Which of the statistics would change if each value in the dataset were increased by 2?
dataValuesIncreased<-2+c(2,2,3,5,6,7, 9,13, 15,18)

mean(dataValuesIncreased)

## [1] 10

median(dataValuesIncreased)

## [1] 8.5

stat_mode(dataValuesIncreased)

## [1] 4

diff(range(dataValuesIncreased))

## [1] 16

hinge_iqr(dataValuesIncreased)

## [1] 10

#Mean, Median, Mode 

#3c. Which of the statistics would change if each value in the dataset were multiplied by 2?
dataValuesmultiplied<-2*c(2,2,3,5,6,7, 9,13, 15,18)

mean(dataValuesmultiplied)

## [1] 16

median(dataValuesmultiplied)

## [1] 13

stat_mode(dataValuesmultiplied)

## [1] 4

diff(range(dataValuesmultiplied))

## [1] 32

hinge_iqr(dataValuesmultiplied)

## [1] 20

#Range, IQR, Mean, Median, Mode

#Range, IQR, Mean, Median, Mode

#QUESTION 4.
#Suppose that hours of sleep per night for single adults between 30 and 40 years of age are normally distributed with a mean of 6.7 hours and a standard deviation of 1.1 hours.  

#COMPUTE PROPORTION ABOVE x (upper tail); the z-score itself is (x - mean)/sd
#proportion_above<-1-pnorm(x, mean_of_dataset, standard_deviation)

#4a. If an adult has a z-score of -1.5, how many hours of sleep does this person get per night? (Report to 2 decimal places.)
#ANSWER = 5.05
#solve (x-6.7)/1.1 = -1.5 for x:  x = mean + z*sd
#step 1: distance from the mean, in hours
-1.5*1.1

## [1] -1.65

#step 2: add that distance to the mean
hours4a<-round(6.7 + -1.5*1.1, digits = 2)
hours4a

## [1] 5.05

#check: converting the answer back gives the z-score; the parentheses matter,
#because without them only 6.7 is divided by 1.1
round((hours4a-6.7)/1.1, digits = 2)

## [1] -1.5

#4b. What proportion of adults sleep longer than 4.5 hours per night? (Report to 3 decimal places.)
adultsSleep<-round(1-pnorm(4.5, mean=6.7, sd=1.1),3)
adultsSleep

## [1] 0.977

#.977

#4c. What proportion of adults sleep between 5.38 and 8.79 hours of sleep? (Report to 3 decimal places.)
#proportion sleeping longer than 5.38 hours (upper tail)
adults538<-round(1-pnorm(5.38, mean=6.7, sd=1.1),3)
adults538

## [1] 0.885

#proportion sleeping longer than 8.79 hours (upper tail)
adults879<-round(1-pnorm(8.79, mean=6.7, sd=1.1),3)
adults879

## [1] 0.029

#between = (above 5.38) - (above 8.79); subtracting in this order keeps the
#proportion positive
answer4c<-round(adults538-adults879,3)
answer4c

## [1] 0.856

#or
zscore538<-(5.38-6.7)/1.1
zscore879<-(8.79-6.7)/1.1
zscore538

## [1] -1.2

zscore879

## [1] 1.9

#lookup in ztable zscore 5.38=.1151 zscore 8.79=.9713
answer4c<-round(.9713-.1151,3)
answer4c

## [1] 0.856

#.856


#Week 3- Bivariate Distributions

#Primary Research Question

#For the 2013 season, Is there a linear relationship between how often a rider
#placed in the Top 10 and the number of times he stayed on his bull for a full
#8 seconds?

#import dataset
bull<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/f544a6f73ec30abf4c6675772ae5f8c7/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/BullRiders.csv")

#1a. How many observations are in the dataset?
glimpse(bull)

## Observations: 58
## Variables: 44
## $ Rider              <chr> "Joao Ricardo Vieira", "Matt Triplett", "J....
## $ Rank15             <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...
## $ Country            <chr> "BRA", "USA", "USA", "BRA", "USA", "BRA", "...
## $ YearBorn           <int> 1984, 1991, 1987, 1994, 1990, 1979, 1982, 1...
## $ Height             <int> 66, 67, 70, 68, 73, 72, 70, 67, 68, 70, 70,...
## $ Weight             <int> 163, 160, 140, 145, 160, 170, 180, 150, 135...
## $ YearsPro           <int> 3, 4, 10, 2, 6, 9, 16, 7, 9, 10, 8, 14, 9, ...
## $ Events14           <int> 28, 28, 22, 1, 15, 28, 28, 28, 12, 14, 12, ...
## $ BuckOuts14         <int> 93, 86, 63, 1, 41, 81, 90, 92, 30, 37, 33, ...
## $ Rides14            <int> 41, 33, 25, 0, 17, 29, 41, 50, 8, 16, 9, 32...
## $ CupPoints14        <dbl> 9520.25, 7493.58, 4973.50, 1671.87, 3240.25...
## $ Rank14             <int> 2, 3, 4, 0, 31, 17, 8, 1, 54, 9, 29, 7, 5, ...
## $ RidePer14          <dbl> 0.4409, 0.3837, 0.3968, 0.0000, 0.4146, 0.3...
## $ RidesPer_45bull_14 <dbl> 0.0000, 0.2000, 0.5000, 0.0000, 0.0000, 0.0...
## $ Rides90pts_14      <int> 5, 2, 4, 0, 0, 1, 2, 1, 0, 2, 0, 0, 1, 0, 0...
## $ Wins14             <int> 2, 0, 2, 0, 1, 2, 1, 0, 0, 0, 0, 2, 3, 3, 0...
## $ Top5_14            <int> 10, 4, 5, 0, 1, 5, 6, 7, 1, 2, 1, 7, 8, 5, ...
## $ Top10_14           <int> 14, 11, 5, 0, 3, 8, 15, 12, 2, 4, 4, 9, 11,...
## $ FinalPoints14      <dbl> 1152.75, 2743.25, 4553.50, 0.00, 83.50, 0.0...
## $ Earnings14         <dbl> 328120.96, 258025.80, 497597.99, 32977.83, ...
## $ Events13           <int> 22, 9, 26, 0, 24, 19, 27, 27, 11, 0, 9, 27,...
## $ BuckOuts13         <int> 72, 26, 90, 0, 66, 53, 84, 91, 26, 0, 18, 8...
## $ Rides13            <int> 35, 11, 47, 0, 22, 25, 39, 50, 6, 0, 2, 24,...
## $ CupPoints13        <dbl> 8748.73, 2827.43, 10399.25, 0.00, 4304.41, ...
## $ Rank13             <int> 3, 26, 1, 0, 25, 16, 4, 2, 42, 0, 40, 10, 1...
## $ RidePer13          <dbl> 0.4861, 0.4231, 0.5222, 0.0000, 0.3333, 0.4...
## $ RidesPer_45bull_13 <dbl> 0.3750, 1.0000, 0.2667, 0.0000, 0.0000, 0.0...
## $ Rides90pts_13      <int> 4, 1, 8, 0, 0, 0, 0, 3, 0, 0, 0, 1, 1, 0, 1...
## $ Wins13             <int> 3, 0, 5, 0, 0, 0, 2, 3, 0, 0, 0, 1, 2, 1, 0...
## $ Top5_13            <int> 5, 1, 11, 0, 2, 6, 5, 11, 1, 0, 0, 2, 3, 4,...
## $ Top10_13           <int> 9, 2, 14, 0, 2, 6, 11, 13, 2, 0, 2, 6, 7, 6...
## $ FinalPoints13      <dbl> 1990.75, 1257.25, 5296.25, 0.00, 0.00, 435....
## $ Earnings13         <dbl> 466585.11, 89377.51, 1810710.75, 0.00, 5857...
## $ Events12           <int> 0, 0, 26, 0, 0, 28, 29, 29, 5, 0, 0, 20, 23...
## $ BuckOuts12         <int> 0, 0, 82, 0, 0, 87, 87, 103, 10, 0, 0, 64, ...
## $ Rides12            <int> 0, 0, 40, 0, 0, 53, 50, 62, 3, 0, 0, 33, 38...
## $ CupPoints12        <dbl> 0.00, 0.00, 9273.25, 0.00, 0.00, 10608.25, ...
## $ Rank12             <int> 0, 0, 8, 0, 0, 4, 2, 1, 53, 0, 0, 13, 10, 1...
## $ RidePer12          <dbl> 0.00, 0.00, 0.49, 0.00, 0.00, 0.61, 0.57, 0...
## $ Wins12             <int> 0, 0, 3, 0, 0, 1, 2, 2, 0, 0, 0, 1, 2, 1, 0...
## $ Top5_12            <int> 0, 0, 8, 0, 0, 10, 9, 8, 0, 0, 0, 5, 5, 4, ...
## $ Top10_12           <int> 0, 0, 13, 0, 0, 14, 12, 18, 1, 0, 0, 7, 11,...
## $ FinalPoints12      <dbl> 0.00, 0.00, 287.00, 0.00, 0.00, 559.50, 251...
## $ Earnings12         <dbl> 0.00, 0.00, 313340.27, 0.00, 0.00, 208724.5...

#1b. How many of the first 10 riders in the dataset have been pro for 10 years or more?
# Frequency table of YearsPro for the first ten riders; the answer is the
# total of the freq column over the rows where YearsPro is 10 or more.
first_ten_riders <- bull[1:10, ]
count(first_ten_riders, vars = "YearsPro")

##   YearsPro freq
## 1        2    1
## 2        3    1
## 3        4    1
## 4        6    1
## 5        7    1
## 6        9    2
## 7       10    2
## 8       16    1

#1c. Of the top 15 riders so far in 2015, how many rides were completed by the rider with the fewest buck-outs in 2014?
# "Top 15" means a 2015 rank of 15 or better, i.e. Rank15 <= 15. The previous
# filter (Rank15 >= 15) selected the riders ranked 15th or worse instead.
# Among the top 15, keep the single rider with the fewest buck-outs in 2014.
fewestBuckout <- sqldf('SELECT * FROM bull WHERE Rank15 <= 15
      ORDER BY BuckOuts14 ASC LIMIT 1')
# Show the columns needed to read off the answer.
fewestBuckout[, c("Rider", "Rank15", "BuckOuts14", "Rides14")]

#2a. Which variable tells us how many times the rider
#has placed in the Top 10 at the end of the 2013 season?
#The variable name in the dataset is:
# Preview the first values of the Top 10 finishes column for 2013
head(bull[["Top10_13"]])

## [1]  9  2 14  0  2  6

#2b. What type of variable is this?
class(bull[["Top10_13"]])

## [1] "integer"

#2c. Which variable tells us the number of times a rider stayed on his bull for the full 8 seconds in 2013?
#The variable name in the dataset is:
# Preview the first values of the successful rides column for 2013
head(bull[["Rides13"]])

## [1] 35 11 47  0 22 25

#2d. What type of variable is this?
class(bull[["Rides13"]])

## [1] "integer"

#Reflect on the Method
#Which method should we be using for the analysis and why?

#3. We will use correlation to answer this lab question. Why?
#We want to explore a linear relationship between two quantitative variables.

#4. We should generate a scatterplot of these two variables before we continue our analysis. Why?
#We want to confirm that the relationship is linear.

#Here is the code you will use:

# Keep only the riders who took part in at least one event in 2013.
entered_2013 <- bull$Events13 > 0
new_bull <- bull[entered_2013, ]

# First variable of interest: successful rides in 2013.
# Histogram first, then five-number summary, mean and standard deviation.
hist(new_bull$Rides13)

rides_2013 <- new_bull$Rides13
fivenum(rides_2013)

## [1]  0 11 19 25 50

mean(rides_2013)

## [1] 18.73171

sd(rides_2013)

## [1] 12.7319

# Second variable of interest: Top 10 finishes in 2013, described the same way.
hist(new_bull$Top10_13)

top10_2013 <- new_bull$Top10_13
fivenum(top10_2013)

## [1]  0  2  6  8 14

mean(top10_2013)

## [1] 5.390244

sd(top10_2013)

## [1] 4.036571

# Scatterplot: successful rides on the x-axis, Top 10 finishes on the y-axis
plot(x = new_bull$Rides13, y = new_bull$Top10_13)

# Overlay the least-squares line of Top10_13 regressed on Rides13
top10_on_rides <- lm(new_bull$Top10_13 ~ new_bull$Rides13)
abline(top10_on_rides)

# Correlation coefficient between the two variables
cor(x = new_bull$Rides13, y = new_bull$Top10_13)

## [1] 0.916606

# Create a correlation matrix 
#vars <- c("Top10_13", "Rides13")
#cor(new_bull[,vars])


#Problem set
#1. Which cases will be selected by this line of code?
# Rows are kept where the rider has a positive 2013 event count.
had_event_2013 <- bull$Events13 > 0
new_bull <- bull[had_event_2013, ]
head(new_bull, n = 6)

##                   Rider Rank15 Country YearBorn Height Weight YearsPro
## 1:  Joao Ricardo Vieira      1     BRA     1984     66    163        3
## 2:        Matt Triplett      2     USA     1991     67    160        4
## 3:          J.B. Mauney      3     USA     1987     70    140       10
## 4:       Nathan Schaper      5     USA     1990     73    160        6
## 5: Valdiron de Oliveira      6     BRA     1979     72    170        9
## 6:     Guilherme Marchi      7     BRA     1982     70    180       16
##    Events14 BuckOuts14 Rides14 CupPoints14 Rank14 RidePer14
## 1:       28         93      41     9520.25      2    0.4409
## 2:       28         86      33     7493.58      3    0.3837
## 3:       22         63      25     4973.50      4    0.3968
## 4:       15         41      17     3240.25     31    0.4146
## 5:       28         81      29     5733.06     17    0.3580
## 6:       28         90      41     8501.06      8    0.4556
##    RidesPer_45bull_14 Rides90pts_14 Wins14 Top5_14 Top10_14 FinalPoints14
## 1:                0.0             5      2      10       14       1152.75
## 2:                0.2             2      0       4       11       2743.25
## 3:                0.5             4      2       5        5       4553.50
## 4:                0.0             0      1       1        3         83.50
## 5:                0.0             1      2       5        8          0.00
## 6:                0.0             2      1       6       15         86.00
##    Earnings14 Events13 BuckOuts13 Rides13 CupPoints13 Rank13 RidePer13
## 1:   328121.0       22         72      35     8748.73      3    0.4861
## 2:   258025.8        9         26      11     2827.43     26    0.4231
## 3:   497598.0       26         90      47    10399.25      1    0.5222
## 4:    77426.3       24         66      22     4304.41     25    0.3333
## 5:   142892.5       19         53      25     5238.25     16    0.4717
## 6:   271530.7       27         84      39     7959.57      4    0.4643
##    RidesPer_45bull_13 Rides90pts_13 Wins13 Top5_13 Top10_13 FinalPoints13
## 1:             0.3750             4      3       5        9       1990.75
## 2:             1.0000             1      0       1        2       1257.25
## 3:             0.2667             8      5      11       14       5296.25
## 4:             0.0000             0      0       2        2          0.00
## 5:             0.0000             0      0       6        6        435.50
## 6:             0.0000             0      2       5       11       2323.75
##    Earnings13 Events12 BuckOuts12 Rides12 CupPoints12 Rank12 RidePer12
## 1:  466585.11        0          0       0        0.00      0      0.00
## 2:   89377.51        0          0       0        0.00      0      0.00
## 3: 1810710.75       26         82      40     9273.25      8      0.49
## 4:   58577.22        0          0       0        0.00      0      0.00
## 5:  107942.10       28         87      53    10608.25      4      0.61
## 6:  241287.70       29         87      50    11542.00      2      0.57
##    Wins12 Top5_12 Top10_12 FinalPoints12 Earnings12
## 1:      0       0        0          0.00        0.0
## 2:      0       0        0          0.00        0.0
## 3:      3       8       13        287.00   313340.3
## 4:      0       0        0          0.00        0.0
## 5:      1      10       14        559.50   208724.5
## 6:      2       9       12       2519.75   331421.7

#riders that have completed more than zero Events in 2013

#2. What will appear in the scatterplot produced by this line of code?
# One point per row of new_bull: rides on x, Top 10 finishes on y
plot(x = new_bull$Rides13, y = new_bull$Top10_13)

#There will be a single data point for each bull rider.

#3. Which value is not a possible output of the following line of code?
cor(x = new_bull$Rides13, y = new_bull$Top10_13)

## [1] 0.916606

#4. A correlation matrix allows you to calculate multiple correlation coefficients at a time. Here, we are only asking for the correlation between Rides13 and Top10_13. If you wanted to include other variables as well, how would you do that?
# Create a correlation matrix
#vars <-c("Top10_13", "Rides13")
#cor(new_bull[,vars])
#Add the variable names to the "vars" object.

#The following plot was produced using the code listed below:
  #bull<-BullRiders
# Scatterplot with Events12 on the x-axis and BuckOuts12 on the y-axis.
plot(bull$Events12, bull$BuckOuts12)
# This is the quiz's original line, kept as given: the model regresses
# Events12 on BuckOuts12, the reverse of the plotted axes, so the line drawn
# does not follow the point cloud.
abline(lm(bull$Events12~bull$BuckOuts12))

#5. In the above scatterplot, why does the line of best fit seem
#to not be going through the center of the scatterplot?
#(Refer to the code below and the dataset in R for help.)
# Corrected call: the y-axis variable (BuckOuts12) is the response on the left
# of ~ and the x-axis variable (Events12) is the predictor on the right.
abline(lm(bull$BuckOuts12 ~ bull$Events12))

#The "Events" and "BuckOuts" variables should be switched in the abline command.

#Problem Set
#What do the histogram and descriptive statistics tell us about the distribution of the Rides13 variable?

#1a. On average, a bull rider in 2013
#has how many rides? (Report the median
#because the histogram is not symmetrical.)
# Check the shape of the distribution, then report the median as the centre
hist(new_bull$Rides13)

median(new_bull[["Rides13"]])

## [1] 19

#Answer: 19

#1b. These bull riders made it into the Top 10 an 
#average of ________ times in 2013. (Hint: Remember again that the histogram is not symmetrical.)
hist(new_bull$Top10_13)

median(new_bull[["Top10_13"]])

## [1] 6

#Answer 6

#What does the scatterplot show us?
#2a. The relationship looks linear, moderately strong, and positive.

#2b. It looks like bull riders that appear frequently in the Top 10 list tend to have a ________ number of successful rides.
#higher

# Correlation between successful rides and Top 10 finishes in 2013,
# stored once so the rounded value below reuses it
rides_top10_cor <- cor(new_bull$Rides13, new_bull$Top10_13)
rides_top10_cor

## [1] 0.916606

# Create a correlation matrix 
#vars <- c("Top10_13", "Rides13")
#cor(new_bull[,vars])


#3a. The correlation, rounded to three decimal places, 
#between the number of Top 10 appearances and the number
#of successful rides for 2013 is r =
round(rides_top10_cor, digits = 3)

## [1] 0.917

#3b. How many times does this value appear in the
#correlation matrix? (Report as a numeral)
# The lab's snippet below fails when new_bull is a data.table: a variable
# placed in `j` is not read as a vector of column names unless it is written
# as `..vars` or `with = FALSE` is supplied.
#vars <- c("Top10_13", "Rides13")
#cor(new_bull[,vars])

#create correlation matrix
# Select the two columns by name from a plain data.frame copy. This works
# whether new_bull is a data.table or a data.frame, and avoids sending the
# table through the SQL engine just to pick two columns.
matrix_cols <- c("Rides13", "Top10_13")
fordatamatrix <- as.data.frame(new_bull)[, matrix_cols]
cor(fordatamatrix)

##           Rides13 Top10_13
## Rides13  1.000000 0.916606
## Top10_13 0.916606 1.000000

#answer=2

#4. On the scatterplot, we see a data point with a
#fairly large residual. This rider had 22 rides,
#but he only placed in the Top 10 two times.
#This rider's data point falls ________ the line of best fit. 
#If his data followed the line of best fit, he should have 
#placed in the Top 10 about ________ times.
# Scatterplot of Top 10 finishes against successful rides
plot(x = new_bull$Rides13, y = new_bull$Top10_13)

# Least-squares line of Top10_13 on Rides13, drawn over the points
rides_fit <- lm(new_bull$Top10_13 ~ new_bull$Rides13)
abline(rides_fit)

# Every 2013 participant with exactly 22 successful rides
outlier_query <- 'SELECT * FROM new_bull WHERE Rides13 = 22 AND Events13  > 0'
outlierRider <- sqldf(outlier_query)
#answers: below and 6

#Use this code to help identify this rider:
#identify a specific record
# Row position(s) of the rider with 2 Top 10 finishes and 22 rides
is_outlier_rider <- new_bull$Top10_13 == 2 & new_bull$Rides13 == 22
which(is_outlier_rider)

## [1] 4

# Keep that rider's full record (row 4, as reported above)
number4 <- new_bull[4, ]

#5. After looking at the data for this rider, 
#can you explain why he has placed in the Top 10 so few times?
# Pull the rider's record and display his 2013 ride percentage, the figure
# the answer below relies on (previously the record was assigned but never
# shown).
number4 <- new_bull[4, ]
number4$RidePer13

## [1] 0.3333

#His ride percentage was only about 33%, which wasn't high enough to place him in the Top 10. 

#Write Your Conclusion
#There is a strong  positive
#linear relationship between the number of 8-second rides a bull rider completed
#and the total number of times he made it in the Top 10 after the 2013 season,
#(r =  .917 ). The average number of rides for these bull riders was around
#19. There were no significant outliers. One rider appeared to have placed
#in the top-10 rankings only twice, despite an above-average number of rides.
#Upon closer inspection, we could see that he did not have a very high ride percentage
#which might account for his few appearances in the Top 10.

# Correlation matrix backing the conclusion, shown to three decimal places
rides_top10_matrix <- cor(fordatamatrix)
round(rides_top10_matrix, digits = 3)

##          Rides13 Top10_13
## Rides13    1.000    0.917
## Top10_13   0.917    1.000

#Lab 3: Professional Bull Riding
#http://www.pbr.com/en/bfts/standings/riders.aspx.

#Review of Correlation

#In this lab, you will use correlation to answer a question of interest. Let's start by remembering why we use correlation.
#1a. A correlation can tell us:
#ANSWER: the direction and strength of a linear relationship between two quantitative variables.

#1b. Look at the scatterplot below. Select the answer that best describes what would happen to the value of the correlation coefficient rxy if the circled point were removed from the analysis.
#The value of rxy would increase.

#LAB PREPARATION

#1. Load Data-BullRiders
# Read the BullRiders CSV straight from the course asset URL
bull_riders_url <- "https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/f544a6f73ec30abf4c6675772ae5f8c7/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/BullRiders.csv"
bull <- fread(bull_riders_url)

#2. One of the following questions will be answered in this lab using correlation. Select the question that can be answered with correlation.
#Which variable has the strongest linear relationship with earnings: successful ride percentage or Cup points? correct


#Let’s break this analysis into the different steps that you will need to take to construct a complete answer.  Be sure to:

#1. Create a dataset which contains riders that participated in at least one event in 2012. Call the dataset new_bull12.
# Riders with at least one event in 2012
entered_2012 <- bull$Events12 > 0
new_bull2012 <- bull[entered_2012, ]

#2. Make a histogram to visualize the distribution of Earnings for 2012.
hist(new_bull2012$Earnings12)

#3. Generate the appropriate descriptive statistics for this distribution.
# summary() is run on the whole table, so the Earnings12 figures appear as the
# last column of the output below.
summary(object = new_bull2012)

##     Rider               Rank15         Country             YearBorn   
##  Length:29          Min.   :  3.00   Length:29          Min.   :1979  
##  Class :character   1st Qu.: 14.00   Class :character   1st Qu.:1983  
##  Mode  :character   Median : 28.00   Mode  :character   Median :1987  
##                     Mean   : 63.07                      Mean   :1985  
##                     3rd Qu.: 58.00                      3rd Qu.:1987  
##                     Max.   :279.00                      Max.   :1992  
##      Height       Weight         YearsPro     Events14       BuckOuts14   
##  Min.   :60   Min.   :120.0   Min.   : 4   Min.   : 1.00   Min.   : 3.00  
##  1st Qu.:67   1st Qu.:140.0   1st Qu.: 9   1st Qu.:19.00   1st Qu.:47.00  
##  Median :68   Median :150.0   Median :10   Median :23.00   Median :63.00  
##  Mean   :68   Mean   :150.7   Mean   :10   Mean   :21.14   Mean   :59.14  
##  3rd Qu.:69   3rd Qu.:162.0   3rd Qu.:11   3rd Qu.:26.00   3rd Qu.:75.00  
##  Max.   :72   Max.   :188.0   Max.   :17   Max.   :28.00   Max.   :92.00  
##     Rides14       CupPoints14         Rank14     RidePer14     
##  Min.   : 1.00   Min.   : 136.8   Min.   : 1   Min.   :0.1000  
##  1st Qu.:14.00   1st Qu.:2909.4   1st Qu.:15   1st Qu.:0.2698  
##  Median :21.00   Median :4708.4   Median :22   Median :0.3492  
##  Mean   :20.86   Mean   :4449.4   Mean   :25   Mean   :0.3310  
##  3rd Qu.:27.00   3rd Qu.:5778.3   3rd Qu.:35   3rd Qu.:0.3684  
##  Max.   :50.00   Max.   :9008.6   Max.   :61   Max.   :0.5455  
##  RidesPer_45bull_14 Rides90pts_14        Wins14          Top5_14     
##  Min.   :0.00000    Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000    1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.000  
##  Median :0.00000    Median :0.0000   Median :0.0000   Median :2.000  
##  Mean   :0.05521    Mean   :0.6897   Mean   :0.7586   Mean   :2.724  
##  3rd Qu.:0.00000    3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:5.000  
##  Max.   :0.50000    Max.   :4.0000   Max.   :3.0000   Max.   :8.000  
##     Top10_14      FinalPoints14      Earnings14         Events13   
##  Min.   : 0.000   Min.   :   0.0   Min.   :   1000   Min.   : 6.0  
##  1st Qu.: 3.000   1st Qu.:   0.0   1st Qu.:  53954   1st Qu.:19.0  
##  Median : 6.000   Median :  84.0   Median : 100297   Median :24.0  
##  Mean   : 5.483   Mean   : 477.9   Mean   : 167298   Mean   :21.1  
##  3rd Qu.: 8.000   3rd Qu.: 389.0   3rd Qu.: 170960   3rd Qu.:26.0  
##  Max.   :15.000   Max.   :4553.5   Max.   :1422603   Max.   :27.0  
##    BuckOuts13       Rides13       CupPoints13        Rank13     
##  Min.   :16.00   Min.   : 2.00   Min.   : 1678   Min.   : 1.00  
##  1st Qu.:54.00   1st Qu.:14.00   1st Qu.: 3350   1st Qu.:10.00  
##  Median :62.00   Median :22.00   Median : 4756   Median :20.00  
##  Mean   :60.17   Mean   :21.72   Mean   : 4948   Mean   :21.83  
##  3rd Qu.:76.00   3rd Qu.:26.00   3rd Qu.: 5995   3rd Qu.:33.00  
##  Max.   :91.00   Max.   :50.00   Max.   :10938   Max.   :44.00  
##    RidePer13      RidesPer_45bull_13 Rides90pts_13        Wins13      
##  Min.   :0.1250   Min.   :0.00000    Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2879   1st Qu.:0.00000    1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.3387   Median :0.00000    Median :0.0000   Median :0.0000  
##  Mean   :0.3508   Mean   :0.03678    Mean   :0.7931   Mean   :0.7586  
##  3rd Qu.:0.4429   3rd Qu.:0.00000    3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :0.5495   Max.   :0.26670    Max.   :8.0000   Max.   :5.0000  
##     Top5_13          Top10_13      FinalPoints13      Earnings13     
##  Min.   : 0.000   Min.   : 1.000   Min.   :   0.0   Min.   :  25221  
##  1st Qu.: 1.000   1st Qu.: 3.000   1st Qu.:   0.0   1st Qu.:  75144  
##  Median : 2.000   Median : 6.000   Median : 212.0   Median : 102374  
##  Mean   : 3.241   Mean   : 6.483   Mean   : 703.1   Mean   : 189554  
##  3rd Qu.: 4.000   3rd Qu.: 9.000   3rd Qu.: 491.2   3rd Qu.: 155554  
##  Max.   :11.000   Max.   :14.000   Max.   :5296.2   Max.   :1810711  
##     Events12       BuckOuts12        Rides12       CupPoints12     
##  Min.   : 1.00   Min.   :  4.00   Min.   : 1.00   Min.   :  261.8  
##  1st Qu.:14.00   1st Qu.: 38.00   1st Qu.: 9.00   1st Qu.: 1408.8  
##  Median :23.00   Median : 64.00   Median :28.00   Median : 5062.5  
##  Mean   :20.79   Mean   : 60.93   Mean   :26.69   Mean   : 5528.2  
##  3rd Qu.:28.00   3rd Qu.: 82.00   3rd Qu.:38.00   3rd Qu.: 9273.2  
##  Max.   :29.00   Max.   :103.00   Max.   :62.00   Max.   :12201.8  
##      Rank12        RidePer12          Wins12          Top5_12      
##  Min.   : 1.00   Min.   :0.1900   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 8.00   1st Qu.:0.2900   1st Qu.:0.0000   1st Qu.: 1.000  
##  Median :17.00   Median :0.3800   Median :1.0000   Median : 4.000  
##  Mean   :21.14   Mean   :0.3986   Mean   :0.8276   Mean   : 3.966  
##  3rd Qu.:34.00   3rd Qu.:0.5300   3rd Qu.:1.0000   3rd Qu.: 5.000  
##  Max.   :65.00   Max.   :0.6100   Max.   :3.0000   Max.   :10.000  
##     Top10_12     FinalPoints12      Earnings12     
##  Min.   : 0.00   Min.   :   0.0   Min.   :  21343  
##  1st Qu.: 3.00   1st Qu.:   0.0   1st Qu.:  66814  
##  Median : 7.00   Median : 186.5   Median : 147952  
##  Mean   : 7.31   Mean   : 769.3   Mean   : 201371  
##  3rd Qu.:11.00   3rd Qu.:1011.8   3rd Qu.: 239936  
##  Max.   :18.00   Max.   :4189.2   Max.   :1464476

#4. Make a correlation matrix for Earnings12, RidePer12 and CupPoints12.
# Pull the three columns of interest, then correlate them pairwise
corrMatrix3 <- sqldf('SELECT Earnings12, RidePer12, CupPoints12 FROM new_bull2012')
cor_2012 <- cor(corrMatrix3)
round(cor_2012, digits = 3)

##             Earnings12 RidePer12 CupPoints12
## Earnings12       1.000     0.593       0.657
## RidePer12        0.593     1.000       0.918
## CupPoints12      0.657     0.918       1.000

#5. Plot a scatterplot for Earnings12 with each variable of interest.  Put Earnings12 on the y-axis.  Check for outliers.
#6. Determine which variable has the strongest linear relationship with Earnings12.
# Earnings12 on the y-axis against each candidate predictor
plot(x = new_bull2012$RidePer12, y = new_bull2012$Earnings12)

plot(x = new_bull2012$CupPoints12, y = new_bull2012$Earnings12)

#1a. What is the shape of the Earnings distribution for 2012?
hist(new_bull2012$Earnings12)

#1b. What was the average amount earned by a bull rider? (Choose the appropriate measure of center; report without a $ sign and round to the nearest whole number.)
earnings_2012 <- new_bull2012$Earnings12
round(median(earnings_2012))

## [1] 147952

#1c. What was the highest amount earned by a bull rider? (Report without a $ sign and round to the nearest whole number.)
round(max(earnings_2012))

## [1] 1464476

#Make a Scatterplot of Earnings and Ride Percentage
#2a. Does the scatterplot show a linear relationship?
# Earnings12 goes on the y-axis, as lab step 5 requires and as the other
# earnings plots in this lab do. The fitted line regresses Earnings12 on
# RidePer12 so that it matches the plotted axes.
plot(new_bull2012$RidePer12, new_bull2012$Earnings12)
abline(lm(new_bull2012$Earnings12 ~ new_bull2012$RidePer12), col="red")

#yes

#create correlation matrix
# Earnings12 against RidePer12
corrMatrix <- sqldf('SELECT Earnings12, RidePer12 FROM new_bull2012')
earnings_rideper_cor <- cor(corrMatrix)
round(earnings_rideper_cor, digits = 3)

##            Earnings12 RidePer12
## Earnings12      1.000     0.593
## RidePer12       0.593     1.000

#Create a Scatterplot of Earnings and Cup Points
plot(x = new_bull2012$CupPoints12, y = new_bull2012$Earnings12)
earnings_on_cup <- lm(new_bull2012$Earnings12 ~ new_bull2012$CupPoints12)
abline(earnings_on_cup, col = "red")

#3a. Does the scatterplot show a linear relationship?
#Yes
#create correlation matrix
# CupPoints12 against Earnings12
corrMatrix3b <- sqldf('SELECT CupPoints12, Earnings12 FROM new_bull2012')
cup_earnings_cor <- cor(corrMatrix3b)
round(cup_earnings_cor, digits = 3)

##             CupPoints12 Earnings12
## CupPoints12       1.000      0.657
## Earnings12        0.657      1.000

#3b. What is the correlation of Earnings with Cup Points for 2012? (report to three decimal places)
# Re-run the two-column query so this answer can be produced on its own
corrMatrix3b <- sqldf('SELECT CupPoints12, Earnings12 FROM new_bull2012')
cup_earnings_matrix <- cor(corrMatrix3b)
round(cup_earnings_matrix, digits = 3)

##             CupPoints12 Earnings12
## CupPoints12       1.000      0.657
## Earnings12        0.657      1.000

#.657
#Outliers and Influential Points

#An outlier can have a significant impact on the correlation coefficient. Sometimes it is important to remove these points to examine the size of this impact. Run this code to identify the extreme data value in Earnings:
  # identify specific case
new_bull12 <- new_bull2012
# Largest 2012 earnings value, and the row position(s) where it occurs
top_earnings_2012 <- max(new_bull12$Earnings12)
which(new_bull12$Earnings12 == top_earnings_2012)

## [1] 4

#4a. The extreme earnings data point belonged to the rider that came in ______ Place in 2012. (Please spell your answer; do not use numerals.)
# Pull that rider's record and read off his 2012 rank
highEarner <- new_bull12[which(new_bull12$Earnings12 == top_earnings_2012), ]
highEarner$Rank12

## [1] 1

#4b. Where does this data point fall in the scatterplot? (Make sure that Earnings12 is on the y-axis)
# Earnings12 (y) against RidePer12 (x), with the fitted line in red
plot(x = new_bull2012$RidePer12, y = new_bull2012$Earnings12)
earnings_on_rideper <- lm(new_bull2012$Earnings12 ~ new_bull2012$RidePer12)
abline(earnings_on_rideper, col = "red")

#above

#Subset the data
# Keep only riders whose 2012 earnings are below one million
below_one_million <- new_bull12$Earnings12 < 1000000
nooutlier <- new_bull12[below_one_million, ]

#Then rerun the correlation matrix and the scatterplots to see the difference. Make sure to use the new dataframe (nooutlier) that you just created.
# RidePer12 against Earnings12 with the outlier removed
corrMatrix4c <- sqldf('SELECT RidePer12, Earnings12 FROM nooutlier')
rideper_cor_nooutlier <- cor(corrMatrix4c)
round(rideper_cor_nooutlier, digits = 3)

##            RidePer12 Earnings12
## RidePer12      1.000      0.804
## Earnings12     0.804      1.000

#4d. After removing the outlier, what was the new correlation of Earnings and Cup Points for 2012? (Round to three decimals)
#create correlation matrix
corrMatrix4d <- sqldf('SELECT CupPoints12, Earnings12 FROM nooutlier')
cup_cor_nooutlier <- cor(corrMatrix4d)
round(cup_cor_nooutlier, digits = 3)

##             CupPoints12 Earnings12
## CupPoints12       1.000      0.893
## Earnings12        0.893      1.000

#4e. We would say that this data point was an influential point because it
#masked the strength of the relationships between Earnings and the other variable


#Write your conclusion
#An initial examination of the relationships between
#Ride Percentage (RidePer) and Earnings, and Cup 
#Points (CupPoints) and Earnings showed that Cup
#Points had the  stronger relationship to Earnings.
#Ride Percentage and Earnings showed a correlation
#value of  .593 while Cup Points and Earnings
#had a correlation value of .657. 
#Visual examination showed an  outlier in both
#relationships—a rider who has earned over 1 million
#dollars. Removal of this increased the initial 
#relationship: Ride Percentage and Earnings now had
#a correlation value of  .804 and Cup Points
#and Earnings had a correlation value of  .893 .
#Cup Points still had the higher relationship to Earnings.
#Visual examination showed good linear relationships for both Ride
#Percentage and Cup Points, indicating the correct use
#of the correlation coefficient.

# All three variables together, outlier excluded
conclusion <- sqldf('SELECT RidePer12, CupPoints12, Earnings12 FROM nooutlier')
conclusion_cor <- cor(conclusion)
round(conclusion_cor, digits = 3)

##             RidePer12 CupPoints12 Earnings12
## RidePer12       1.000       0.910      0.804
## CupPoints12     0.910       1.000      0.893
## Earnings12      0.804       0.893      1.000

# Earnings12 against RidePer12 without the outlier, fitted line in red
plot(x = nooutlier$RidePer12, y = nooutlier$Earnings12)
rideper_fit <- lm(nooutlier$Earnings12 ~ nooutlier$RidePer12)
abline(rideper_fit, col = "red")

# Earnings12 against CupPoints12 without the outlier, fitted line in blue
plot(x = nooutlier$CupPoints12, y = nooutlier$Earnings12)
cup_fit <- lm(nooutlier$Earnings12 ~ nooutlier$CupPoints12)
abline(cup_fit, col = "blue")

#Problem Set

#Question 1

#During a professional bull-riding event, riders usually attempt to ride a bull three or more times.  This means that they can record a "ride" (successfully staying on the bull) multiple times in the same event.
#bull<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/f544a6f73ec30abf4c6675772ae5f8c7/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/BullRiders.csv")

#1.Subset the dataset for riders that had at least 1 ride in the 2014 season. Call this dataset new_bull.
# Riders with at least one successful ride in 2014
rides14_query <- 'SELECT * FROM bull WHERE Rides14 > 0'
new_bull <- sqldf(rides14_query)

#2. Create a new variable or vector for the average number of rides per event for each bull rider in the new_bull dataset:
# Successful rides divided by events entered, one value per rider
RidesPerEvent14 <- with(new_bull, Rides14 / Events14)

#3. Make a histogram of your "rides per event" variable and find the five-number summary for your "rides per event" variable.
# Shape of the rides-per-event distribution
hist(x = RidesPerEvent14)

# Tukey five-number summary
fivenum(x = RidesPerEvent14)

## [1] 0.2000000 0.6666667 1.0000000 1.1396104 2.0000000

# Quartile summary including the mean
summary(object = RidesPerEvent14)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2000  0.6667  1.0000  0.9377  1.1380  2.0000

#1a. What is the minimum value? (Round to 2 decimal places.)
lowest_rate <- min(RidesPerEvent14)
round(lowest_rate, digits = 2)

## [1] 0.2

#1b. What is the median?
middle_rate <- median(RidesPerEvent14)
round(middle_rate, digits = 2)

## [1] 1

#1c. What is the maximum value? (Round to 2 decimal places.)
highest_rate <- max(RidesPerEvent14)
round(highest_rate, digits = 2)

## [1] 2

#1d. Create a scatterplot of "rides per event" and yearly ranking
#(defined by the "Rank14" variable) and add a line of best fit.
#Which of the following best describes the relationship between these two variables?
# Rank14 (y) against rides per event (x), with the fitted line in blue
plot(x = RidesPerEvent14, y = new_bull$Rank14)
rank_fit <- lm(new_bull$Rank14 ~ RidesPerEvent14)
abline(rank_fit, col = "blue")

# Correlation computed once and reused for the rounded answer below
rides_rank_cor <- cor(RidesPerEvent14, new_bull$Rank14)
rides_rank_cor

## [1] -0.4945153

#Answer: The two variables have a negative linear relationship. 

#1e. What is the correlation coefficient for rides per event and yearly ranking? (Report to 3 decimal places)
round(rides_rank_cor, digits = 3)

## [1] -0.495

#1f. Suppose that college GPA and graduate school GPA have a correlation coefficient of 0.75.
#Based on this, what proportion of variation in graduate school GPA is left
#unexplained after taking college GPA into account? (Report to 4 decimal places)?
#r squared
# Unexplained share of variation is 1 - r^2, here with r = 0.75
1 - round(0.75^2, digits = 4)

## [1] 0.4375

#1g. Choose the correct scatterplot for each correlation coefficient.
#r = 1.0000
#Answer: A

#r = -0.790
#Answer: E

#r = 0.000
#Answer: F

#r = 0.950
#Answer: C

#1h. Suppose you have been given a gift card worth $30. 
#You wish to buy gloves that cost $3 per pair (including tax).
#Assuming that the price of gloves does not change, which scatterplot
#illustrates the relationship of the amount of money left on your gift card,
#given the number of pairs of gloves that you buy?
#Scatterplot B
#negative linear relationship

#1i. Suppose we are exploring the relationship between time spent walking
#and calories burned. We ask a group of people to begin walking.
#When each person stops, we record the number of calories burned, along
#with how many minutes he or she walked. Which scatterplot would best fit
#this relationship between calories burned and time spent walking?
#Scatterplot C
#not a perfect linear relationship but close

#Question 2
#Using the dataset below, find the correlation coefficient between time spent studying and exam grade.

# Minutes spent studying and the exam grade obtained, one pair per student
minutes<-c(30, 45, 180, 95, 130, 140, 30, 80, 60, 110,  0,  80)
grade<- c(74, 68, 87, 90, 94, 84, 92, 88, 82, 93, 65, 90)

#combine vectors into a matrix
study<-cbind(minutes, grade)

#2a. What is the correlation coefficient based on the data? (Round to 3 decimal places.)
round(cor(minutes, grade),3)

## [1] 0.597

cor(study)

##           minutes     grade
## minutes 1.0000000 0.5967026
## grade   0.5967026 1.0000000

#2b. Approximately what percentage of the variation in exam scores can be explained by the amount of time that each student studied?
#(Round to whole number without a % sign.)
# r squared expressed as a percentage, rounded once at the end
round(100 * cor(minutes, grade)^2)

## [1] 36

#2c. Create a scatterplot of the data (exam grades and time spent studying).
#What is the value of the outlier (the student that got a high grade but
#didn't study very long)?
plot(minutes,grade)
abline(lm(grade ~ minutes), col="blue")

#2d. When the outlier is removed, what is the new value of r? (Round to 3 decimal places.)
# Locate the outlier (30 minutes studied, grade 92) and drop it by position
# from both vectors instead of retyping them, so the reduced data cannot
# drift out of step with the original.
outlier_idx <- which(minutes == 30 & grade == 92)
# Negative indexing with an empty index would return an empty vector, so
# insist on exactly one match before dropping.
stopifnot(length(outlier_idx) == 1L)
noOutlierminutes <- minutes[-outlier_idx]
noOutliergrade <- grade[-outlier_idx]
round(cor(noOutlierminutes, noOutliergrade),3)

## [1] 0.737

#2e. How did the outlier impact our efforts to assess the relationship between time spent studying and exam grades?
#The outlier caused the relationship to look weaker than it really is

# Foundations_of_Data_Analysis_UT_Austin-Part_1A.R
#
# anitaowens
#
# Sun Jun 11 10:54:20 2017