#Weeks 4-6
#Week Four: Bivariate Distributions (Categorical Data)

#load library packages
library(ggplot2)
library(data.table)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
####################Bivariate Distributions (Categorical Data)########
# Fear-of-heights study: 2 x 2 contingency table (gender by fear status).
# Only the four cell counts are typed in; every margin is derived from them
# so the totals cannot drift out of sync with the cells.
menAfraid <- 68
menUnafraid <- 94
womenAfraid <- 109
womenUnafraid <- 89
allmen <- menAfraid + menUnafraid
allWomen <- womenAfraid + womenUnafraid
allAfraid <- menAfraid + womenAfraid
allUnafraid <- menUnafraid + womenUnafraid
# `all` has the same name as base::all(); R still resolves all(...) calls to
# the function, and the variable name is kept for the rest of the script.
all <- allmen + allWomen
#1) What fraction represents the proportion of people in the study that were afraid of heights?
allAfraid / all
## [1] 0.4916667
#2) How many simple events (outcomes) were possible for participants in this study?
4
## [1] 4
#3) Of those participants that were afraid of heights, what percentage were men? (Round to 1 decimal place with no % sign.)
menAfraid / allAfraid * 100
## [1] 38.41808
#38.4


#4) What percentage of women were afraid of heights? (Round to 1 decimal place with no % sign.)
# The question asks for a percentage to 1 decimal place, so scale to percent
# before rounding instead of rounding the proportion to 2 decimals.
womenAfraid / allWomen * 100
## [1] 55.05051
round(womenAfraid / allWomen * 100, 1)
## [1] 55.1
#55.1

#5) What percentage of all participants were not afraid of heights? (Round to 1 decimal place with no % sign.)
round(allUnafraid / all * 100, 1)
## [1] 50.8
allUnafraid / all * 100
## [1] 50.83333
#50.8

#6) A contingency table is used to simultaneously display counts of
#two categorical variables.


#Independence & Conditional Probability

#Below is a contingency table showing data from a University of Texas Southwestern Medical Center study on Hepatitis C.


#1) How many simple events (outcomes) were possible for participants in this study?
6
## [1] 6
#2) What was the total number of participants in this study?
# Sum of the six cells of the contingency table.
sum(17, 8, 18, 35, 53, 495)
## [1] 626
#3) What was the marginal distribution for Hepatitis status in this study?
#43 had Hepatitis; 583 did not have Hepatitis.

#4) Overall, what percentage of participants had a tattoo? ( Round to 1 decimal place and do not include % sign.)
(52 + 61) / 626 * 100
## [1] 18.05112
round((52 + 61) / 626 * 100, digits = 1)
## [1] 18.1
#5) What percentage of those participants with Hepatitis C had a tattoo done in a commercial parlor? (Round to 1 decimal and do not include % sign.)
round(17 / 43 * 100, digits = 1)
## [1] 39.5
#6) What percentage of those who had a tattoo done in a commercial parlor have Hepatitis C? (Round to 1 decimal and do not include % sign.)
round(17 / 52 * 100, digits = 1)
## [1] 32.7
#7) What is the value of A?
#Probability = outcomes with Hepatitis / total outcomes in sample space = a/b = c
a <- 43

#8) What is the value of B?
b <- 626

#9) What is the value of C, the probability of randomly selecting a participant with Hepatitis? (Round to 3 decimal places.)
# `c` shares its name with base::c(); calls such as c(1, 2) still find the function.
c <- a / b
round(c, digits = 3)
## [1] 0.069
#10) In general, what must be true of P(A)?
#It must be between the values of 0 and 1, inclusive

#Graphing the Contingency Table
#Using the contingency table above, solve for each of the following probabilities. (Report as proportions rounded to 3 decimal places.)
#Is there an association between car color and marital status?

#1) P (black/white/silver)
# Car colour by marital status. Cell counts used below:
#   married:   40 red, 22 black/white/silver, 19 remaining
#   unmarried: 45 red, 10 black/white/silver, 12 remaining
allCars <- 40 + 22 + 19 + 45 + 10 + 12
allCars
## [1] 148
bws <- (22 + 10) / allCars
round(bws, 3)
## [1] 0.216
#2) P (married)
married <- 40 + 22 + 19
unmarried <- 45 + 10 + 12
round(married / allCars, 3)
## [1] 0.547
#3) P (unmarried and black/white/silver car)
round(10 / allCars, 3)
## [1] 0.068
#4) P (red car | married)
round(40 / married, digits = 3)
## [1] 0.494
40 / married
## [1] 0.4938272
#5) P (married | red car)
round(40 / (40 + 45), 3)
## [1] 0.471
#6) What ratio would you use to solve for P(red car| unmarried) using the contingency table?
round(45 / unmarried, 3)
## [1] 0.672
#Now solve for P (red car | unmarried) using the formula below. (Probabilities should be reported as proportions rounded to 3 decimal places.)
#7) The probability of having a red car and being unmarried
round(45 / allCars, 3)
## [1] 0.304
#8) The probability of being unmarried
unmarried <- 45 + 10 + 12
everyone <- 40 + 22 + 19 + 45 + 10 + 12
round(unmarried / everyone, 3)
## [1] 0.453
#9) The probability of having a red car given that the person is unmarried
# answer 7 divided by answer 8, taken from the UNROUNDED values: dividing the
# rounded 0.304 by 0.453 compounds the rounding error and yields 0.671, which
# disagrees with the exact ratio 45/67 = 0.672 found in question 6.
round((45 / everyone) / (unmarried / everyone), 3)
## [1] 0.672
#Using the above formula, solve for these three values. (Report each as a proportion rounded to three decimal places.)

#10) P (red)
#the probability of having a red car
round((40 + 45) / allCars, digits = 3)
## [1] 0.574
#11) P (red | married)
#the probability of having a red car given that the person is married
#the probability of being married

40 / allCars #the intersection of having a red car and being married
## [1] 0.2702703
married / allCars #the probability of being married
## [1] 0.5472973
round((40 / allCars) / (married / allCars), 3) #answer
## [1] 0.494
#12)  P (red | unmarried)
#the probability of having a red car given that the person is unmarried
round((45 / everyone) / (unmarried / everyone), 3) #answer 7 divided by answer 8, unrounded
## [1] 0.672
#13) If car color and marital status are independent, what should be true?
#P(red) = P(red | married) 

#14) Do car color and marital status appear to be independent?
#No, because the probability of having a red car is different for married and unmarried people, though the difference is small.

######################Lab 4: Austin City Limits#######################

#For artists age 30 or older, do female artists play different kinds of music on Austin City Limits than male artists?

#import data
# Download the Austin City Limits artist data with data.table's fread().
acl <- fread(
  input = "https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/2bad2d0d8a5d13fcc764f30b5ebcc9c5/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/AustinCityLimits.csv"
)

#inspect data
# Column names and types; the pasted output below lists 116 rows and 14 variables.
str(object = acl)
## Classes 'data.table' and 'data.frame':   116 obs. of  14 variables:
##  $ Artist       : chr  "Aimee Mann" "Alabama Shakes" "Allen Toussaint" "Andrew Bird" ...
##  $ Year         : int  2008 2013 2009 2009 2007 2009 2010 2009 2003 2008 ...
##  $ Month        : chr  "November" "February" "January" "October" ...
##  $ Season       : chr  "fall" "winter" "winter" "fall" ...
##  $ Gender       : chr  "F" "F" "M" "M" ...
##  $ Age          : int  52 24 75 39 33 62 37 35 43 67 ...
##  $ Age.Group    : chr  "Fifties or Older" "Twenties" "Fifties or Older" "Thirties" ...
##  $ Grammy       : chr  "Y" "N" "N" "N" ...
##  $ Genre        : chr  "Singer-Songwriter" "Rock/Folk/Indie" "Jazz/Blues" "Rock/Folk/Indie" ...
##  $ BB.wk.top10  : int  0 1 NA 1 1 0 1 NA 1 0 ...
##  $ Twitter      : int  101870 73313 308634 56343 404439 3326 125758 8197 158647 690 ...
##  $ Twitter.100k : int  1 0 1 0 1 0 1 0 1 0 ...
##  $ Facebook     : int  113576 298278 10721 318313 1711685 27321 563505 18955 1381051 1715 ...
##  $ Facebook.100k: int  1 1 0 1 1 0 1 0 1 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
#1a. How many artists are in this dataset?
# Number of rows = number of artists.
nrow(acl)
## [1] 116
#1b. How many of the first 10 artists in the dataset were Grammy winners?
first10 <- acl[1:10, ]
# SQL string literals belong in single quotes; double quotes denote
# identifiers and are only accepted as strings by SQLite as a legacy fallback.
sqldf("select count(*) from first10 where grammy = 'Y'")
## Loading required package: tcltk
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
##   count(*)
## 1        4
#1c) What genre was played by the first female artist in the dataset who was over 60 years of age? Match the case of the genre as it appears in the dataset.
sqldf("select Artist, Age, Gender, Genre from acl where Age > 60 and Gender = 'F'")
##           Artist Age Gender      Genre
## 1 Bettye Lavette  67      F Jazz/Blues
## 2    Bonnie Rait  63      F    Country
## 3   Jeff Bridges  63      F    Country
#Check the Variables of Interest
#Let's find the variables we need to answer the question.
#2a. Which variable tells us the kind of music played by each artist? The variable name in the dataset is:
#Genre

#2b. What type of variable is this?
#categorical

#2c. Which variable tells us whether the lead singer or performer is male or not?
#Gender

#2d. What type of variable is this?
#Categorical

#Reflect on the Method
#3. We will generate a contingency table of genre and gender to help us with this analysis. Why?
#The table will show us how many male and female artists played each type of music.
# Cross-tabulate gender (rows) against genre (columns) for every artist in acl.
aclTable <- table(acl[["Gender"]], acl[["Genre"]])
print(aclTable)
##    
##     Country Jazz/Blues Rock/Folk/Indie Singer-Songwriter
##   F       7          6              12                10
##   M      11          7              56                 7
#4. We will compare marginal and conditional probabilities to determine if female and male artists tend to play different kinds of music. Why?
#We want to determine if two categorical variables are independent or not.

###########################################Prepare for the Analysis#####
#Prepare for the Analysis
#Primary Research Question
#For artists age 30 or older, do female artists play different kinds of music on Austin City Limits than male artists?

#Here is the code you will use:
  
  #Subset the data for artists age 30 or older
  older <- acl[Age >= 30]  # data.table row filter: keep artists aged 30 or more

# Marginal distributions: one-way counts of genre and of gender in the subset.
genre <- table(older[["Genre"]])
print(genre)
## 
##           Country        Jazz/Blues   Rock/Folk/Indie Singer-Songwriter 
##                17                11                61                13
gender <- table(older[["Gender"]])
print(gender)
## 
##  F  M 
## 25 77
# Contingency table: gender in rows, genre in columns.
twoway <- table(older[["Gender"]], older[["Genre"]])
print(twoway)
##    
##     Country Jazz/Blues Rock/Folk/Indie Singer-Songwriter
##   F       6          4               8                 7
##   M      11          7              53                 6
# Same table with row and column totals appended.
addmargins(A = twoway)
##      
##       Country Jazz/Blues Rock/Folk/Indie Singer-Songwriter Sum
##   F         6          4               8                 7  25
##   M        11          7              53                 6  77
##   Sum      17         11              61                13 102
# Visualize the counts
#library(RColorBrewer) #Rcolorbrewer palette
# Side-by-side bar chart of the gender-by-genre counts in `twoway`.
# With beside = TRUE, barplot() returns a matrix of bar midpoints with the same
# layout as `twoway`, which is reused below to position the count labels.
# ylim leaves headroom so the label above the tallest bar is not clipped.
twowayplot <- barplot(
  twoway,
  col = c("pink", "blue"),
  legend.text = TRUE,
  beside = TRUE,
  ylim = c(0, max(twoway) * 1.15),
  main = "Austin City Limits Music Genres by Gender"
)
# Label every bar with its own count, read from the table rather than typed by
# hand (the hand-typed Jazz/Blues male label said 11, but the count is 7).
text(
  x = as.vector(twowayplot),
  y = as.vector(twoway) + 3,
  labels = as.vector(twoway),
  col = "red"
)

# Calculate P(A): the probability of each genre being played
# Marginal distribution, P(A): share of the 30+ artists in each genre.
prop.table(x = genre)
## 
##           Country        Jazz/Blues   Rock/Folk/Indie Singer-Songwriter 
##         0.1666667         0.1078431         0.5980392         0.1274510
# Calculate P(A|B): the probability of each genre being played, given the artist's gender
# (margin = 1 makes each row sum to 1).
prop.table(x = twoway, margin = 1)
##    
##        Country Jazz/Blues Rock/Folk/Indie Singer-Songwriter
##   F 0.24000000 0.16000000      0.32000000        0.28000000
##   M 0.14285714 0.09090909      0.68831169        0.07792208
#1) How many columns will be present in the table generated by the following line of code?
# Built from the full acl dataset, so this overwrites the 30+ `gender` table created earlier.
gender <- table(acl[["Gender"]])
print(gender)
## 
##  F  M 
## 35 81
#2) This code produces a bar chart with both a legend and side-by-side bars for each gender:
barplot(twoway, legend = TRUE, beside = TRUE)

#What would the code look like if we wanted to keep the legend but stack the bars (instead of set them side-by-side)?
barplot(twoway, legend = TRUE)

#3) This line of code will produce four values, one for each genre of music:
prop.table(x = genre)
## 
##           Country        Jazz/Blues   Rock/Folk/Indie Singer-Songwriter 
##         0.1666667         0.1078431         0.5980392         0.1274510
#What value should you get if you sum the four values together?
0.1666667 + 0.1078431 + 0.5980392 + 0.1274510
## [1] 1
#4) What does the value "1" refer to in this line of code:
prop.table(x = twoway, margin = 1)
##    
##        Country Jazz/Blues Rock/Folk/Indie Singer-Songwriter
##   F 0.24000000 0.16000000      0.32000000        0.28000000
##   M 0.14285714 0.09090909      0.68831169        0.07792208
help("prop.table")
#The number 1 references the first variable (gender) listed in the contingency table code.

#5) Suppose we ran the following code to find conditional probabilities. What error has caused the prop.table function to not work? (You may want to examine the dataset in R for help.)

#acl <- AustinCityLimits

# Gender (rows) by Grammy status (columns) for all artists, with totals.
gender_grammy <- table(acl[["Gender"]], acl[["Grammy"]])
addmargins(A = gender_grammy)
##      
##         N   Y Sum
##   F    21  14  35
##   M    46  35  81
##   Sum  67  49 116
#Conduct the Analysis in R

#1a. How many male artists are in the 30+ year old artist subset of the Austin City Limits dataset?
# Count male artists aged 30+. String literals use single quotes: in SQL,
# double quotes denote identifiers and only work as strings through a
# SQLite legacy fallback.
male30Plus <- sqldf("select count(*) from acl where Gender = 'M' and Age >= 30")
male30Plus
##   count(*)
## 1       77
#1b. How many female artists are in the 30+ year old artist subset of the Austin City Limits dataset?
female30Plus <- sqldf("select count(*) from acl where Gender = 'F' and Age >= 30")
female30Plus
##   count(*)
## 1       25
#2a. To determine the proportion of jazz performers that were male, you would divide ________ by ________. (Enter responses in the order listed in the question.)
print(twoway)
##    
##     Country Jazz/Blues Rock/Folk/Indie Singer-Songwriter
##   F       6          4               8                 7
##   M      11          7              53                 6
addmargins(A = twoway)
##      
##       Country Jazz/Blues Rock/Folk/Indie Singer-Songwriter Sum
##   F         6          4               8                 7  25
##   M        11          7              53                 6  77
##   Sum      17         11              61                13 102
# Male jazz performers (7) over all jazz performers (11).
7 / 11
## [1] 0.6363636
#2b. To determine the proportion of males that performed jazz, you would divide ________ by ________. (Enter responses in the order listed in the question.)
addmargins(A = twoway)
##      
##       Country Jazz/Blues Rock/Folk/Indie Singer-Songwriter Sum
##   F         6          4               8                 7  25
##   M        11          7              53                 6  77
##   Sum      17         11              61                13 102
# Male jazz performers (7) over all male artists (77).
7 / 77
## [1] 0.09090909
#3a. Which table should you look at to determine how many artists performed rock/folk/indie music: genre or gender?
print(genre)
## 
##           Country        Jazz/Blues   Rock/Folk/Indie Singer-Songwriter 
##                17                11                61                13
#3b. How many artists performed rock/folk/indie music?
61
## [1] 61
#4a. Which of these lines of code provides the probability that a randomly selected artist from the dataset performed rock/folk/indie music?
prop.table(x = genre)
## 
##           Country        Jazz/Blues   Rock/Folk/Indie Singer-Songwriter 
##         0.1666667         0.1078431         0.5980392         0.1274510
#4b. What is the probability that a randomly selected artist from the dataset performed rock/folk/indie music?
0.598
## [1] 0.598
#5a. Which of these lines of code provides the probability that a randomly selected female artist performed rock/folk/indie music?
prop.table(x = twoway, margin = 1)
##    
##        Country Jazz/Blues Rock/Folk/Indie Singer-Songwriter
##   F 0.24000000 0.16000000      0.32000000        0.28000000
##   M 0.14285714 0.09090909      0.68831169        0.07792208
#5b. What is the probability that a randomly selected female artist performed rock/folk/indie music?
0.32
## [1] 0.32
#6. For genre and gender to be independent, which of the following statements must be true?
#P(rock) = P(rock|female) 

###################Write Your Conclusions#####################

#The music played on Austin City Limits was grouped into four genres,
#including Country, Jazz, Rock and Singer-Songwriter. We wanted to 
#examine only those artists who were age 30 or older. Rock was
#the most frequently played genre, performed by  59.8 % of the
#Austin City Limits artists. Among female artists, however, only 32
#% of the artists played Rock music. This difference between the marginal
#and conditional probabilities suggests that gender and genre
#are not independent. This difference was also evident in the bar plots,
#where it was evident that females were more likely to perform in the Singer/Songwriter
#category than their male counterparts.

# All columns for artists aged 30 or older, selected via SQL on the acl table.
over30 <- sqldf("select * from acl where Age >=30")
# Preview the first six rows.
head(over30, n = 6L)
##                Artist Year    Month Season Gender Age        Age.Group
## 1          Aimee Mann 2008 November   fall      F  52 Fifties or Older
## 2     Allen Toussaint 2009  January winter      M  75 Fifties or Older
## 3         Andrew Bird 2009  October   fall      M  39         Thirties
## 4         Arcade Fire 2007 November   fall      F  33         Thirties
## 5 Asleep at the Wheel 2009 November   fall      M  62 Fifties or Older
## 6      Avett Brothers 2010  January winter      M  37         Thirties
##   Grammy             Genre BB.wk.top10 Twitter Twitter.100k Facebook
## 1      Y Singer-Songwriter           0  101870            1   113576
## 2      N        Jazz/Blues          NA  308634            1    10721
## 3      N   Rock/Folk/Indie           1   56343            0   318313
## 4      Y   Rock/Folk/Indie           1  404439            1  1711685
## 5      Y           Country           0    3326            0    27321
## 6      N   Rock/Folk/Indie           1  125758            1   563505
##   Facebook.100k
## 1             1
## 2             0
## 3             1
## 4             1
## 5             0
## 6             1
# Marginal genre distribution of the 30+ artists, as percentages to 1 decimal.
over30genre <- table(over30$Genre)
round(prop.table(over30genre) * 100, 1)
## 
##           Country        Jazz/Blues   Rock/Folk/Indie Singer-Songwriter 
##              16.7              10.8              59.8              12.7
# Female artists aged 30+. The string literal uses single quotes: in SQL,
# double quotes denote identifiers and only work as strings through a
# SQLite legacy fallback.
female30plus <- sqldf("select * from acl where Gender = 'F' and Age >= 30")
head(female30plus)
##           Artist Year    Month Season Gender Age        Age.Group Grammy
## 1     Aimee Mann 2008 November   fall      F  52 Fifties or Older      Y
## 2    Arcade Fire 2007 November   fall      F  33         Thirties      Y
## 3 Bettye Lavette 2008  October   fall      F  67 Fifties or Older      N
## 4          Bjork 2007  October   fall      F  47          Forties      N
## 5    Bonnie Rait 2012 November   fall      F  63 Fifties or Older      Y
## 6 Brandi Carlile 2010 November   fall      F  32         Thirties      N
##               Genre BB.wk.top10 Twitter Twitter.100k Facebook
## 1 Singer-Songwriter           0  101870            1   113576
## 2   Rock/Folk/Indie           1  404439            1  1711685
## 3        Jazz/Blues           0     690            0     1715
## 4 Singer-Songwriter           1  450096            1  2754505
## 5           Country           1   18683            0   357770
## 6   Rock/Folk/Indie           1   20549            0   223872
##   Facebook.100k
## 1             1
## 2             1
## 3             0
## 4             1
## 5             1
## 6             1
# Genre distribution among female artists aged 30+, as proportions.
over30females <- table(female30plus[["Genre"]])
prop.table(x = over30females)
## 
##           Country        Jazz/Blues   Rock/Folk/Indie Singer-Songwriter 
##              0.24              0.16              0.32              0.28
# Row-wise conditional distribution of the gender-by-genre table, in percent.
prop.table(x = twoway, margin = 1) * 100
##    
##       Country Jazz/Blues Rock/Folk/Indie Singer-Songwriter
##   F 24.000000  16.000000       32.000000         28.000000
##   M 14.285714   9.090909       68.831169          7.792208
##################Review of Conditional and Marginal Probability#########
#1a. The probability that an event will occur, given that a different event has also occurred, is known as:
#a conditional probability

#1b. Which of the following must be true for two events, A and B, to be considered independent?
#P(A)=P(A|B)

#Lab Preparation

#Primary Research Question
#Among male artists, is there an association between winning a Grammy and the genre of music that he plays?

#2. One of the following questions will be answered in this lab by comparing marginal and conditional probabilities. Select the question that should be answered using this method:
#Among male artists, is there an association between winning a Grammy award and the genre of music that you play?

#1. Subset the data (males only).
# Male artists only. The string literal uses single quotes: in SQL, double
# quotes denote identifiers and only work as strings through a SQLite
# legacy fallback.
males <- sqldf("select * from acl where Gender = 'M'")
#2. Create a table to show the marginal distributions for Genre and Grammy.
#3. Create a contingency table to show the conditional distribution for Genre and Grammy.
#4. Make a bar chart to better visualize how many artists in each Genre received a Grammy.
#5. Calculate P(A):  the probability of winning a Grammy.
#6. Calculate P(A|B): the probability of winning Grammy, given the artist's Genre.
#7. Interpret what these probabilities tell us about the relationship between Genre and winning a Grammy.

#rm(list = ls())  # Clean up

#import data
# Re-import the dataset so this section also runs from a cleaned workspace
# (the optional clean-up just above is commented out).
acl <- fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/2bad2d0d8a5d13fcc764f30b5ebcc9c5/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/AustinCityLimits.csv")

#Contingency Table
#Note: All of the questions below deal with the data frame that contains males only.
#1) How many male artists won a Grammy?

# SQL string literals use single quotes; double quotes denote identifiers.
maleGrammywinners <- sqldf("select count(*) from acl where Gender = 'M' and Grammy = 'Y'")
maleGrammywinners
##   count(*)
## 1       35
#2) How many male artists did not win a Grammy?

maleGrammyNOwinners <- sqldf("select count(*) from acl where Gender = 'M' and Grammy = 'N'")
maleGrammyNOwinners
##   count(*)
## 1       46
#3) Which genre had the greatest number of Grammy wins?
# Restricted to male artists, matching the note above and the two counts
# before it; without the Gender filter the counts cover artists of both genders.
genreGrammy <- sqldf("select genre, count(*) from acl where Gender = 'M' and Grammy = 'Y' group by genre")
genreGrammy
##               Genre count(*)
## 1           Country        7
## 2        Jazz/Blues        4
## 3   Rock/Folk/Indie       22
## 4 Singer-Songwriter        2
#4) What is the probability that a randomly selected artist was a Grammy winner? (Report as a proportion rounded to three decimal places)
# Marginal distribution of Grammy status among male artists.
grammy <- table(males[["Grammy"]])
print(grammy)
## 
##  N  Y 
## 46 35
46 + 35
## [1] 81
#the number of grammy winners divided by the total number of artists
round(35 / 81, digits = 3)
## [1] 0.432
#5) To determine the probability of winning a Grammy if the artist was a singer-songwriter, you would divide _______ by ________. (Enter numerical values.)
# Grammy status (rows) by genre (columns) for male artists.
male_genre_grammy <- table(males[["Grammy"]], males[["Genre"]])
addmargins(A = male_genre_grammy)
##      
##       Country Jazz/Blues Rock/Folk/Indie Singer-Songwriter Sum
##   N         4          3              34                 5  46
##   Y         7          4              22                 2  35
##   Sum      11          7              56                 7  81
# Joint proportions: every cell divided by the grand total.
prop.table(x = male_genre_grammy)
##    
##        Country Jazz/Blues Rock/Folk/Indie Singer-Songwriter
##   N 0.04938272 0.03703704      0.41975309        0.06172840
##   Y 0.08641975 0.04938272      0.27160494        0.02469136
barplot(
  male_genre_grammy,
  legend.text = TRUE,
  beside = TRUE,
  main = "How many artists in each genre won a grammy",
  col = c("red", "green")
)

#The number of singer-songwriter grammy winners divided by the total number of singer-songwriters in the category
2 / 7
## [1] 0.2857143
#6) To determine the probability that a randomly-selected Grammy winner was a singer-songwriter, you would divide ________ by ________. (Enter numerical values.)
#num of grammy winners in songwriting category divided
#by the total number of grammy winners
round(2 / 35, digits = 3)
## [1] 0.057
#Conditional Probabilities
#What is the probability that a randomly selected male artist from each of the following genres won a Grammy? (Report as proportions rounded to three decimal places.)
#7a. Country?
# Each ratio is male Grammy winners in the genre over all male artists in that genre.
round(7 / 11, digits = 3)
## [1] 0.636
#7b. Jazz?
round(4 / 7, digits = 3)
## [1] 0.571
#7c. Rock/folk/indie?
round(22 / 56, digits = 3)
## [1] 0.393
#7d. Singer-songwriter?
round(2 / 7, digits = 3)
## [1] 0.286
#7e. Our conclusion should be that winning a Grammy _______ independent of Genre.
#IS Not


#Write Your Conclusion
#Among male artists, is there an association between winning a Grammy and the genre of music that he plays?

#There is an association between winning a Grammy and the Genre of music an artist plays.
#The probability of winning a Grammy, regardless of Genre, is 43.21%
#However, examination of a contingency table containing both Grammy
#and Genre showed that the conditional probability of winning a Grammy
#changes by genre. If an artist is in the Country Genre, the conditional
#probability of winning a Grammy is  63.64%, while if an artist is in the
#Singer/Songwriter Genre, the conditional probability of winning a Grammy
#is 28.6%. Visual examination of the barplot shows the conditional probabilities of winning a Grammy
#are not equal across Genres.

#Question 1

#You want to see if an artist's popularity on Facebook
#(whether or not they have 100,000 or more likes) has
#anything to do with their age.

#1.  Generate a table to show the number of artists that are "popular" and those that are not.
# One-way count of the Facebook.100k indicator across all artists.
pop <- table(acl[["Facebook.100k"]])
print(pop)
## 
##  0  1 
## 31 85
#1=pop
#0=not popular


#2.  Generate a table to show the number of "popular" artists within each age group.
# Popularity indicator in rows, age group in columns.
age <- table(acl[["Facebook.100k"]], acl[["Age.Group"]])
print(age)
##    
##     Fifties or Older Forties Thirties Twenties
##   0               13       6        9        3
##   1               19      24       31       11
#1a. How many artists in the dataset have 100,000 or more likes on Facebook?
print(pop)
## 
##  0  1 
## 31 85
#1b. Which age group has the highest number of artists that have 100,000 or more likes on Facebook? (Spell out your answer, i.e. twenties, thirties, forties, etc.)
print(age)
##    
##     Fifties or Older Forties Thirties Twenties
##   0               13       6        9        3
##   1               19      24       31       11
#thirties

#1c. For each age group, fill in the proportion of artists who have 100,000 or more likes on Facebook. (Use the appropriate function in R to calculate these, and round to 3 decimal places (i.e. 0.123.)
age
##    
##     Fifties or Older Forties Thirties Twenties
##   0               13       6        9        3
##   1               19      24       31       11
addmargins(age)
##      
##       Fifties or Older Forties Thirties Twenties Sum
##   0                 13       6        9        3  31
##   1                 19      24       31       11  85
##   Sum               32      30       40       14 116
# The question asks for the proportion WITHIN each age group, i.e. each column
# divided by its own total (margin = 2). prop.table(age) without a margin
# divides by the grand total and gives joint proportions instead.
# Row "1" holds the artists with 100,000 or more likes.
popular_share <- round(prop.table(age, margin = 2)["1", ], 3)
popular_share
## Fifties or Older          Forties         Thirties         Twenties 
##            0.594            0.800            0.775            0.786
#Twenties = ____
round(11 / 14, 3)
## [1] 0.786
0.786
## [1] 0.786
#Thirties = ____
round(31 / 40, 3)
## [1] 0.775
0.775
## [1] 0.775
#Forties = ____
round(24 / 30, 3)
## [1] 0.8
0.8
## [1] 0.8
#Fifties or older = ____
round(19 / 32, 3)
## [1] 0.594
0.594
## [1] 0.594
#Question 2
#A high school counselor wants to categorize students according to two variables: their gender (male or female) and their grade level (freshman, sophomore, junior or senior).

#2a. To how many possible outcomes can students be assigned?
#An Economics professor kept a record of grades earned by the college students in his class, as shown below. (In the US, an "A" is the strongest grade and an "F" is the Weakest.)
# Four grade levels times two genders.
4 * 2
## [1] 8
8
## [1] 8
#2b. What proportion of students in the class received a grade of A? (Round to 2 decimal places.)
# `a` is the number of A grades; totalStudents adds up every cell of the grade table.
a <- 5 + 8 + 11 + 9
totalStudents <- 5 + 8 + 11 + 9 + 10 + 10 + 5 + 9 + 9 + 9 + 4 + 4 + 10 + 7 + 4 + 2 + 6 + 4 + 2 + 0
round(a / totalStudents, digits = 2)
## [1] 0.26
#2c. What proportion of the students were upperclassmen (juniors and seniors)? (Round to 2 decimal places.)
juniorSenior <- 11 + 5 + 4 + 4 + 2 + 9 + 9 + 4 + 2 + 0
juniorSenior
## [1] 50
round(juniorSenior / totalStudents, digits = 2)
## [1] 0.39
#2d. What is the probability that a freshman received a failing grade of F? (Round to 2 decimal places.)
freshman <- 5 + 10 + 9 + 10 + 6
freshman
## [1] 40
round(6 / freshman, digits = 2)
## [1] 0.15
#2e. What is the probability that a randomly selected student from the class would be a
#sophomore that received a grade of B? (Round to 2 decimal places.)
#P(A and B): a joint probability, so the denominator is the whole class

round(10 / totalStudents, digits = 2)
## [1] 0.08
#2f. What proportion of juniors passed the course with a grade of D or better? (Round to 2 decimal places.)
juniors <- 11 + 5 + 4 + 4 + 2
juniors
## [1] 26
goodJuniors <- 11 + 5 + 4 + 4
goodJuniors
## [1] 24
round(goodJuniors / juniors, digits = 2)
## [1] 0.92
#2g. What is the probability that a randomly selected student from this class would be a senior? (Report to 2 decimal places.)
#seniors/total num of students
seniors <- 9 + 9 + 4 + 2 + 0
seniors
## [1] 24
totalStudents
## [1] 128
round(seniors / totalStudents, digits = 2)
## [1] 0.19
#2h. If a student received a grade of D in the class, what is the probability that the student was a senior? (Round to 2 decimal places.)
# `d` is the number of D grades; 2 of them went to seniors.
d <- 10 + 7 + 4 + 2
d
## [1] 23
round(2 / d, digits = 2)
## [1] 0.09
#2i. Does the probability that a randomly selected student is a senior change if we know that the student received a grade of D in the course?
#Yes

#Use the below probability statements to answer the following two questions. Report answers as proportions.

#P(A) = 0.35
#P(A and B) = 0.15  OR p(a)*p(b|a)

#3a. If A and B are independent, what is the value of P(A|B)? (Round to 2 decimal places.)
# Under independence P(A|B) equals P(A).
0.35
## [1] 0.35
#3b. What is the probability of P(B|A)? (Round to 2 decimal places.)
# Background reading on conditional probability. Opened only in an interactive
# session, so a batch run of the script neither launches a browser nor stops
# on a machine where no browser is configured.
if (interactive()) {
  browseURL("http://www.mathgoodies.com/lessons/vol6/conditional.html")
}
# P(B|A) = P(A and B) / P(A)
round(0.15 / 0.35, 2)
## [1] 0.43
#Question 4

#A movie theater conducted a survey to determine the
#movie preferences of men and women. They asked a total
#of 130 adults (50 women and 80 men) to choose their
#favorite movie genre out of four choices: Action,
#Comedy, Horror, or Romance. The results of their
#survey are shown below.

#4a. Based on the above information, do Gender and Movie preferences appear to be independent?
#No

#4b. What is the probability that a randomly chosen person from the survey prefers Action films? (Report as a proportion rounded to 2 decimal places.)
# Number of Action fans: 12% of the 50 women plus 35% of the 80 men.
0.12 * 50 + 0.35 * 80
## [1] 34
round(34 / 130, digits = 2)
## [1] 0.26
#4c. What is P(Action|Women)? (Report as a proportion rounded to 2 decimal places.)
0.12
## [1] 0.12
##################Linear Functions#############################
#Week 5: Linear Functions

#Comprehension Check
#1. A national park contains foxes that prey on rabbits.  The table below gives the two populations, F and R, over an 11-month period, where t=0 means January, t=1 means February, and so on. 
#1a. Is F a function of T?
# Monthly fox (F) and rabbit (R) populations; month 0 is January.
month <- 0:10
rabbits <- c(1000, 750, 567, 500, 567, 750, 1000, 1250, 1433, 1500, 1433)
foxes <- c(150, 143, 125, 100, 75, 57, 50, 57, 75, 100, 125)
myData <- cbind(month, rabbits, foxes)
as.data.frame(myData)
##    month rabbits foxes
## 1      0    1000   150
## 2      1     750   143
## 3      2     567   125
## 4      3     500   100
## 5      4     567    75
## 6      5     750    57
## 7      6    1000    50
## 8      7    1250    57
## 9      8    1433    75
## 10     9    1500   100
## 11    10    1433   125
# Plot foxes against month, the pair the question is about. plot() on the
# whole matrix would use only its first two columns (month and rabbits).
plot(myData[, "month"], myData[, "foxes"], xlab = "month", ylab = "foxes")

#Yes, because for each value of t, there is exactly one value of F.

# Clean up only the objects created in this exercise; rm(list = ls()) would
# also delete every other object in the workspace.
rm(month, rabbits, foxes, myData)

#1b. Is R a function of F?
#No, because when F=57, R= 750 and R=1250

#2. A mathematical model is a function used to describe how data is behaving in
#an actual situation

#3.3a. Is the number of female senators, S, a function of the session of Congress, C?
#Yes, because for each session of Congress, there is exactly one number of female senators.

#3b. Let f(C) represent the number of female senators serving in the Cth Congress. What does the statement f(108)=14 mean?
#In the 108th Congress, there were 14 female senators.

#######################LINE OF BEST FIT######################

#1. What is a residual? Select all that apply.
#the distance between a data point in a scatterplot and the line of best fit
#e=y-ŷ

#2. Do states with higher populations have more millionaires?  Here is data from 2008. The variable labeled "Population" in the table and scatterplot will be referred to as "State.Population" in the questions that follow in order to avoid confusion with the meaning of "population" as a concept in statistics.
#State  <- c("Connecticut", "Delaware", "Maine",
           #"Massachusetts", "New Hampshire", "New Jersey",
            #"New York", "Pennsylvania", "Rhode Island",
            #"Vermont")
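#Millionaires <- as.numeric(c(86, 18, 22, 141, 26, 207, 368, 228, 20))
#NOTE: only 9 Millionaires values are listed here for the 10 states above, so
#the vector lengths in this inline version do not match.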
#Population <- as.numeric(c(35,8, 13, 64, 13, 87, 193, 124, 11, 6))
#population<-cbind(State, Millionaires, Population)

# Load the state-level data set; the Millionaires and Population columns are
# used below.
population <- read.csv(file = "population.csv")

#2a. What is the correlation between Millionaires and State.Population? (Round to 3 decimal places.)
corr <- cor(x = population$Millionaires, y = population$Population)
round(corr, digits = 3)
## [1] 0.992
#2b. What is the coefficient of determination? (Round to 3 decimal places.)
# The coefficient of determination is the squared correlation.
round(corr^2, digits = 3)
## [1] 0.985
# Multiple Linear Regression Example 
#fit <- lm(y ~ x1 + x2 + x3, data=mydata)
#summary(fit) # show results


############Interpreting the Linear Model######################

#1a. What is the interpretation of ŷ for this model, if y represents the variable, Millionaires?
# Scatterplot: state population on the x-axis, millionaires on the y-axis.
plot(x = population$Population, y = population$Millionaires)

# Simple linear regression of Millionaires on Population.
pop.lm <- lm(formula = Millionaires ~ Population, data = population)
summary(pop.lm)
## 
## Call:
## lm(formula = Millionaires ~ Population, data = population)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.457  -8.592  -6.042   7.922  33.607 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.29600    6.83972   0.921    0.384    
## Population   1.92065    0.08414  22.826 1.44e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.83 on 8 degrees of freedom
## Multiple R-squared:  0.9849, Adjusted R-squared:  0.983 
## F-statistic:   521 on 1 and 8 DF,  p-value: 1.439e-08
plot(pop.lm) # draws the 4 default diagnostic plots for the fit

# Regression coefficients table taken from the model summary.
summary(pop.lm)$coefficients
##             Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)  6.29600 6.83971704  0.920506 3.842257e-01
## Population   1.92065 0.08414394 22.825766 1.438760e-08
##It is the predicted number of millionaires in a state, based on that state's population.

#1b. This linear model crosses the y-axis at 6.296. What is the interpretation of this point?
#A state with a population of 0 is expected to have 6,296 millionaires

#1c. You create a new variable, subtracting the lowest Population
#value in the sample from each Population value:
#new_pop <- State.Population - min(State.Population) #example from class
# Shift Population so the smallest state sits at 0, and store the result as a
# new column of the data frame.
population$new_pop <- with(population, Population - min(Population))

#This gives a new result from linFit():
  #Millionaires=17.82 + (1.921 * State.Population)

#1d. What is the interpretation of 17.82 in this model?
#On average, a state with a population equal to the lowest population has 17,820 millionaires.

#1e. Interpret 1.921 in the above model (with an intercept of 17.82).
#For each additional 100,000 people in a state's population, the model predicts 1,921 more millionaires.

############Lab 4: Track and Field World Records (Linear Functions)#######

# Download the world-records table for this lab directly from the course URL.
WR <- fread(
  input = "https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/9f6060f531771fa15d177b4dad023b09/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/WorldRecords.csv"
)

# Quick structural check: column names, types and the first few values.
glimpse(WR)
## Observations: 285
## Variables: 7
## $ Event       <chr> "Mens 100m", "Mens 100m", "Mens 100m", "Mens 100m"...
## $ Type        <chr> "time", "time", "time", "time", "time", "time", "t...
## $ Record      <dbl> 10.06, 10.03, 10.02, 9.95, 9.93, 9.92, 9.90, 9.86,...
## $ Athlete     <chr> "Bob Hayes", "Jim Hines", "Charles Greene", "Jim H...
## $ Nationality <chr> "United States", "United States", "United States",...
## $ Location    <chr> "Tokyo, Japan", "Sacramento, USA", "Mexico City, M...
## $ Year        <int> 1964, 1968, 1968, 1968, 1983, 1988, 1991, 1991, 19...
#1a. How many different types of events (e.g. "Mens 100m," "Womens shotput," etc.) are represented in the dataset?
# Count the distinct events with a SQL query against WR, then show the result.
events <- sqldf("SELECT COUNT(DISTINCT event) FROM WR")
print(events)
##   COUNT(DISTINCT event)
## 1                    10
# List the distinct event names themselves.
list_events <- sqldf("SELECT DISTINCT event FROM WR")
print(list_events)
##              Event
## 1        Mens 100m
## 2      Womens 100m
## 3        Mens 800m
## 4      Womens 800m
## 5  Mens TripleJump
## 6        Mens Mile
## 7      Womens Mile
## 8   Mens Polevault
## 9     Mens Shotput
## 10  Womens Shotput