##Installing Packages

#install.packages("devtools")
#install.packages("knitr")
#install.packages("rmarkdown")
#install.packages("ggplot2")
#install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(here)
## here() starts at C:/Users/user/Desktop
# Read Kobe Bryant's shot log. The CSV appears to start with a UTF-8
# byte-order mark: without fileEncoding = "UTF-8-BOM" the mark leaks into the
# first column name (it shows up as `ï..vs` in the output recorded below, which
# was captured before this fix). With it, the first column is read as `vs`.
kobe_basket <- read.csv(
  "C:/Users/user/Documents/kobe_HM.csv",
  fileEncoding = "UTF-8-BOM"
)

##Finding the information

# Compact overview of the data: one line per column with its type and its
# first few values.
glimpse(kobe_basket)
## Rows: 133
## Columns: 6
## $ ï..vs       <chr> "ORL", "ORL", "ORL", "ORL", "ORL", "ORL", "ORL", "ORL", "O~
## $ game        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ quarter     <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "2", "2"~
## $ time        <chr> "9:47:00", "9:07:00", "8:11:00", "7:41:00", "7:03:00", "6:~
## $ description <chr> "Kobe Bryant makes 4-foot two point shot", "Kobe Bryant mi~
## $ shot        <chr> "H", "M", "M", "H", "H", "M", "M", "M", "M", "H", "H", "H"~


##1.) Verify this data by looking at the first 8 rows of the data.
##Reading data of first 8 rows

# Show the first 8 rows to check that the data were read in as expected.
head(kobe_basket, n = 8)
##   ï..vs game quarter    time
## 1   ORL    1       1 9:47:00
## 2   ORL    1       1 9:07:00
## 3   ORL    1       1 8:11:00
## 4   ORL    1       1 7:41:00
## 5   ORL    1       1 7:03:00
## 6   ORL    1       1 6:01:00
## 7   ORL    1       1 4:07:00
## 8   ORL    1       1 0:52:00
##                                               description shot
## 1                 Kobe Bryant makes 4-foot two point shot    H
## 2                               Kobe Bryant misses jumper    M
## 3                        Kobe Bryant misses 7-foot jumper    M
## 4 Kobe Bryant makes 16-foot jumper (Derek Fisher assists)    H
## 5                         Kobe Bryant makes driving layup    H
## 6                               Kobe Bryant misses jumper    M
## 7                       Kobe Bryant misses 12-foot jumper    M
## 8                       Kobe Bryant misses 19-foot jumper    M
# Hit ("H") / miss ("M") outcomes of the first 8 shots.
kobe_basket[["shot"]][seq_len(8)]
## [1] "H" "M" "M" "H" "H" "M" "M" "M"

##The streak lengths in Kobe's first 8 shots are one, zero, two, zero, zero.
##2.) What does a streak length of 1 mean, i.e. how many hits and misses are in a streak of 1? What about a streak length of 0?

# Compute streak lengths from a vector of shot outcomes.
#
# x: vector of outcomes in which "H" marks a hit; every other value is
#    treated as a miss.
# Returns a numeric vector with the number of consecutive hits between
# successive misses; the start and the end of `x` also count as misses.
calc_streak <- function(x) {
  # 1 for a hit, 0 for anything else.
  is_hit <- numeric(length(x))
  is_hit[x == "H"] <- 1
  # Pad with a miss on both ends so leading and trailing runs are closed off.
  padded <- c(0, is_hit, 0)
  miss_positions <- which(padded == 0)
  # Distance between neighbouring misses, minus one, is the run of hits
  # between them.
  diff(miss_positions) - 1
}
# Streak lengths for Kobe's shots, stored as a one-column tibble (`length`).
kobe_streak <- tibble(length = calc_streak(kobe_basket[["shot"]]))

##A streak length of 1 means one hit ‘H’ followed by one miss ‘M’.
##A streak length of 0 means zero hits: a miss ‘M’ with no hit immediately before it.
##3.) Create a barplot of the length variable using ggplot2.

# Bar chart of how often each streak length occurs.
library(ggplot2)
ggplot(data = kobe_streak, mapping = aes(x = length)) +
  geom_bar(fill = "dark blue") +
  labs(title = " Barplot using Length variable")

##4.) Describe the distribution of Kobe’s streak lengths from the 2009 NBA finals. What was his typical streak length? How long was his longest streak of baskets? Make sure to include the accompanying plot in your answer.

# Recompute the streak lengths as a plain numeric vector (this replaces the
# tibble stored under the same name) and plot the frequency of each length.
kobe_streak <- calc_streak(kobe_basket[["shot"]])
barplot(
  height = table(kobe_streak),
  main = " Barplot displaying Kobe streak lengths in NBA finals",
  xlab = " length",
  ylab = "count"
)

##The barplot shows that the distribution is right-skewed and unimodal.

# Boxplot of Kobe's streak lengths. The streak vector itself is passed to
# boxplot(); wrapping it in table() would instead summarise the handful of
# per-length frequency counts rather than the streak lengths, so the vertical
# axis is labelled as streak length.
kobe_streak <- calc_streak(kobe_basket$shot)
boxplot(
  kobe_streak,
  main = " Boxplot showing Kobe streak length in NBA Final",
  ylab = "length"
)


##The boxplot shows the distribution is unimodal.
##So the distribution of Kobe's streak lengths is unimodal.

# Minimum, quartiles, median, mean and maximum of the streak lengths.
summary(kobe_streak)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.7632  1.0000  4.0000

##The typical (median) streak length is 0.
##The longest streak of baskets was 4.

# The two possible results of a coin flip.
coin_outcomes <- c("heads", "tails")
# A single flip of a fair coin.
sample(x = coin_outcomes, size = 1, replace = TRUE)
## [1] "tails"
# 100 flips of a fair coin.
sim_fair_coin <- sample(x = coin_outcomes, size = 100, replace = TRUE)
sim_fair_coin
##   [1] "heads" "heads" "heads" "heads" "heads" "heads" "tails" "tails" "tails"
##  [10] "heads" "tails" "tails" "heads" "heads" "tails" "heads" "heads" "tails"
##  [19] "tails" "heads" "tails" "heads" "tails" "heads" "heads" "tails" "tails"
##  [28] "heads" "heads" "heads" "tails" "tails" "tails" "heads" "heads" "heads"
##  [37] "tails" "heads" "heads" "tails" "tails" "heads" "heads" "tails" "heads"
##  [46] "heads" "tails" "tails" "heads" "tails" "tails" "heads" "tails" "tails"
##  [55] "heads" "heads" "heads" "tails" "heads" "heads" "heads" "tails" "heads"
##  [64] "tails" "heads" "heads" "heads" "heads" "tails" "heads" "heads" "tails"
##  [73] "heads" "tails" "heads" "heads" "tails" "tails" "heads" "heads" "tails"
##  [82] "heads" "heads" "tails" "tails" "heads" "heads" "heads" "tails" "tails"
##  [91] "tails" "tails" "heads" "tails" "tails" "tails" "tails" "tails" "tails"
## [100] "heads"
# Count heads and tails among the 100 fair flips.
table(sim_fair_coin)
## sim_fair_coin
## heads tails 
##    54    46
# 100 flips of an unfair coin: heads with probability 0.2, tails with 0.8.
sim_unfair_coin <- sample(
  x = coin_outcomes,
  size = 100,
  replace = TRUE,
  prob = c(0.2, 0.8)
)

##5.) In your simulation of flipping the unfair coin 100 times, how many flips came up heads? Include the code for sampling the unfair coin in your response. Since every time you generate a new simulation you may get different answers, you should also “set a seed” before you sample. Read more about setting a seed below.

# Fix the RNG seed so the unfair-coin simulation is reproducible.
set.seed(773952)
coin_outcomes <- c("heads", "tails")
# 100 flips where heads has probability 0.2 and tails has probability 0.8.
sim_unfair_flip <- sample(
  x = coin_outcomes,
  size = 100,
  replace = TRUE,
  prob = c(0.2, 0.8)
)
sim_unfair_flip
##   [1] "tails" "tails" "tails" "tails" "tails" "tails" "tails" "tails" "tails"
##  [10] "tails" "tails" "tails" "heads" "tails" "tails" "tails" "tails" "tails"
##  [19] "tails" "tails" "tails" "tails" "heads" "tails" "tails" "tails" "tails"
##  [28] "tails" "tails" "tails" "tails" "heads" "tails" "tails" "tails" "tails"
##  [37] "tails" "heads" "heads" "tails" "heads" "tails" "tails" "tails" "tails"
##  [46] "tails" "tails" "tails" "tails" "tails" "tails" "tails" "tails" "tails"
##  [55] "tails" "tails" "tails" "tails" "heads" "tails" "tails" "tails" "heads"
##  [64] "tails" "heads" "tails" "tails" "tails" "tails" "tails" "tails" "heads"
##  [73] "tails" "heads" "tails" "heads" "tails" "tails" "tails" "tails" "tails"
##  [82] "heads" "heads" "heads" "tails" "tails" "tails" "tails" "heads" "tails"
##  [91] "tails" "tails" "tails" "tails" "tails" "tails" "tails" "tails" "tails"
## [100] "tails"
# Tally heads and tails.
table(sim_unfair_flip)
## sim_unfair_flip
## heads tails 
##    16    84

##After flipping the unfair coin 100 times, 16 flips came up heads and the remaining 84 came up tails.

# Seed the RNG so the simulated shots are reproducible.
set.seed(102)
# Possible outcomes of a shot: hit ("H") or miss ("M").
shot_outcomes <- c("H", "M")
# One simulated shot; hit and miss are equally likely.
sim_basket <- sample(x = shot_outcomes, size = 1, replace = TRUE)

##6. What change needs to be made to the sample function so that it reflects a shooting percentage of 45%? Make this adjustment, then run a simulation to sample 133 shots. Assign the output of this simulation to a new object called sim_basket.

shot_outcomes <- c("H", "M")

# 133 independent simulated shots, each a hit with probability 0.45.
sim_basket <- sample(
  x = shot_outcomes,
  size = 133,
  replace = TRUE,
  prob = c(0.45, 0.55)
)
sim_basket
##   [1] "M" "H" "M" "M" "M" "H" "M" "H" "M" "H" "H" "H" "M" "H" "H" "H" "M" "H"
##  [19] "H" "H" "H" "M" "H" "M" "M" "M" "M" "M" "M" "M" "H" "M" "H" "H" "M" "M"
##  [37] "H" "H" "M" "H" "H" "M" "M" "M" "M" "M" "M" "M" "H" "M" "M" "M" "M" "H"
##  [55] "M" "H" "M" "H" "H" "H" "M" "M" "H" "M" "H" "M" "M" "H" "H" "H" "H" "H"
##  [73] "M" "M" "M" "M" "M" "H" "M" "H" "M" "M" "M" "M" "M" "M" "H" "H" "M" "M"
##  [91] "M" "H" "M" "M" "M" "H" "M" "M" "M" "H" "M" "H" "M" "M" "H" "M" "H" "M"
## [109] "M" "H" "M" "M" "M" "H" "M" "M" "M" "M" "M" "M" "M" "M" "H" "H" "M" "H"
## [127] "H" "H" "H" "H" "H" "M" "M"

##Setting prob = c(0.45, 0.55) makes each simulated shot a hit with probability 45%; the simulation above samples 133 such shots.
##7.) Using calc_streak, compute the streak lengths of sim_basket, and save the results in a data frame called sim_streak.

# Streak-length helper (same logic as the calc_streak definition earlier in
# this file, defined again here).
#
# x: vector of shot outcomes; "H" is a hit, anything else counts as a miss.
# Returns a numeric vector: the number of hits in a row between consecutive
# misses, with an implicit miss before the first and after the last shot.
calc_streak <- function(x) {
  # Indicator vector: 1 where the shot is a hit, 0 elsewhere.
  hit_flags <- replace(numeric(length(x)), x == "H", 1)
  # Positions of misses once a sentinel miss is added at each end.
  miss_at <- which(c(0, hit_flags, 0) == 0)
  gaps <- diff(miss_at)
  gaps - 1
}
# Streak lengths of the simulated shooter as a one-column tibble (`length`).
sim_streak <- tibble(length = calc_streak(x = sim_basket))
sim_streak
## # A tibble: 80 x 1
##    length
##     <dbl>
##  1      0
##  2      1
##  3      0
##  4      0
##  5      1
##  6      1
##  7      3
##  8      3
##  9      4
## 10      1
## # ... with 70 more rows

##9.)If you were to run the simulation of the independent shooter a second time, how would you expect its streak distribution to compare to the distribution from the question above? Exactly the same?Somewhat similar? Totally different? Explain your reasoning.

# Run the independent-shooter simulation a second time (133 shots, hit
# probability 0.45) and tabulate hits and misses.
sim_basket <- sample(
  x = shot_outcomes,
  size = 133,
  replace = TRUE,
  prob = c(0.45, 0.55)
)
table(sim_basket)
## sim_basket
##  H  M 
## 48 85

##Since sample() draws new random values on every call, a second run of the independent shooter does not give exactly the same results as the first. The results from run 1 and run 2 differ, but only slightly, so the streak distributions are somewhat similar.

##10.). How does Kobe Bryant’s distribution of streak lengths compare to the distribution of streak lengths for the simulated shooter? Using this comparison, do you have evidence that the hot hand model fits Kobe’s shooting patterns? Explain.

# Streak lengths of the second simulated run as a tibble. The column is not
# given a name here, so it is labelled by the expression
# `calc_streak(sim_basket)` (see the printed header below).
streak_independent<-tibble(calc_streak(sim_basket))
streak_independent
## # A tibble: 86 x 1
##    `calc_streak(sim_basket)`
##                        <dbl>
##  1                         0
##  2                         0
##  3                         0
##  4                         0
##  5                         2
##  6                         0
##  7                         1
##  8                         0
##  9                         0
## 10                         0
## # ... with 76 more rows
# Bar chart of the simulated shooter's streak lengths, with the count printed
# on top of each bar.
# - The streaks are read from the tibble's own column (auto-named
#   `calc_streak(sim_basket)`) instead of being recomputed from the global
#   sim_basket inside aes(), so the plot always matches the data passed in.
# - after_stat(count) replaces the deprecated `..count..` notation.
# - The fill legend gets a readable title instead of the raw expression.
ggplot(
  data = streak_independent,
  mapping = aes(
    x = factor(`calc_streak(sim_basket)`),
    fill = factor(`calc_streak(sim_basket)`)
  )
) +
  geom_bar() +
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = 0.0) +
  ggtitle("Barplot showing the length of the streaks") +
  labs(x = "length", y = "Counts", fill = "length")

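##We can see that Kobe's distribution and the simulated shooter's distribution are almost the same because both are right-skewed. We do not have additional information showing a difference between the two patterns, so we have no evidence that the hot hand model fits Kobe's shooting.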