This project is part of my dataquest course. The aim of the project is to interrogate the data from the American gameshow ‘Jeopardy’. The dataset includes every question, the question value, the answer, the date, round and show number. There are two aims: to see if some categories are statistically more likely to appear more often than others ; to see if certain terms in questions are statistically more likely to appear in high value questions. The chi-squared test will be used to answer these questions.

#load libraries

library (readr)
library(stringr)
library(ggplot2)
Registered S3 method overwritten by 'dplyr':
  method           from
  print.rowwise_df     
Want to understand how all the pieces fit together? Read R for Data Science:
https://r4ds.had.co.nz/
library(dplyr)

Attaching package: 㤼㸱dplyr㤼㸲

The following objects are masked from 㤼㸱package:stats㤼㸲:

    filter, lag

The following objects are masked from 㤼㸱package:base㤼㸲:

    intersect, setdiff, setequal, union
library(purrr)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
-- Attaching packages --------------------------------------- tidyverse 1.3.0 --
v tibble  3.0.1     v forcats 0.5.0
v tidyr   1.0.3     
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
#load data and clean column names

setwd("C:/Users/Ana/Desktop/Data Analytics/CSV Files")
data <- read_csv("jeopardy_2.csv")
Parsed with column specification:
cols(
  `Show Number` = col_double(),
  `Air Date` = col_character(),
  Round = col_character(),
  Category = col_character(),
  Value = col_character(),
  Question = col_character(),
  Answer = col_character()
)
colnames(data)
[1] "Show Number" "Air Date"    "Round"       "Category"    "Value"       "Question"    "Answer"     
colnames(data) <- tolower(colnames(data))
colnames(data)[1:2] <- c("show_number", "air_date")

#print first 5 rows of data
head(data, 5)
#print number of rows in dataframe
nrow(data)
[1] 19999
#print the number of unique categories
length(unique(data$category))
[1] 3580
#clean value column and make numeric

data_2 <- data %>%
  mutate(value = str_replace(value, "\\$", "")) %>%
  mutate(value = str_replace(value, "[Nn]one", "")) %>%
  mutate(value = as.numeric(value)) %>%
  drop_na(value)
NAs introduced by coercion
head(data_2,100)
#normalise the question, answer and category columns

normalise <- function(vector) {
  vector <- str_replace_all(vector, "[:punct:]", "")
  vector <- tolower(vector)
}

data_3 <- data_2 %>%
  mutate(category = normalise(category)) %>%
  mutate(question = normalise(question)) %>%
  mutate(answer = normalise(answer))

head(data_3)
#split the air_date column into three columns; day, month, year

data_4 <- data_3 %>%
  separate(air_date, c("day", "month", "year"), sep = "/") %>%
  mutate(day = as.numeric(day)) %>%
  mutate(month = as.numeric(month)) %>%
  mutate(year = as.numeric(year))

head(data_4, 100)
NA
science_cat <- data_4 %>%
  filter(category == "science")

sci_observed <- nrow(science_cat)

history_cat <- data_4 %>%
  filter(category == "history")

hist_observed <- nrow(history_cat)

shakespeare_cat <- data_4 %>%
  filter(category == "shakespeare")

ss_observed <- nrow(shakespeare_cat)

sci_observed
[1] 33
hist_observed
[1] 38
ss_observed
[1] 19
n_questions <- nrow(data_4)
p_category_expected <-   1/3369 
p_not_category_expected <- 3368/3369 
p_expected <- c(p_category_expected, p_not_category_expected)

science_obs <- c(sci_observed, n_questions - sci_observed)

chisq.test(science_obs, p = p_expected)

    Chi-squared test for given probabilities

data:  science_obs
X-squared = 133.76, df = 1, p-value < 2.2e-16

The p-value is < 0.05. I.e. it is extremely unlikely that this result would occur. Therefore we should reject the null hypothesis that there is no bias towards science questions.

history_obs <- c(hist_observed, n_questions - hist_observed)

chisq.test(history_obs, p = p_expected)

    Chi-squared test for given probabilities

data:  history_obs
X-squared = 187.05, df = 1, p-value < 2.2e-16

The p-value is < 0.05. I.e. it is extremely unlikely that this result would occur. Therefore we should reject the null hypothesis that there is no bias towards history questions.

shakespeare_obs <- c(ss_observed, n_questions - ss_observed)

chisq.test(shakespeare_obs, p = p_expected)

    Chi-squared test for given probabilities

data:  shakespeare_obs
X-squared = 31.966, df = 1, p-value = 1.569e-08

The p-value is < 0.05. I.e. it is extremely unlikely that this result would occur. Therefore we should reject the null hypothesis that there is no bias towards shakespeare questions.

#order data set by air date

data_5 <- data_4 %>%
  arrange(year, month, day)
longwords <- function(vector) {
  indv_words <- unlist(str_split(vector, "\\s+"))
  logic <- str_detect(indv_words, "\\w\\w\\w\\w\\w\\w+")
  long_words <- indv_words[logic]
  long_words
}
test_sentence <- "hellothere my name is Ana and I'm learning to code elephant swimming monsoon hurricane government predjudice fakenews"
#vector <- test_sentence
#indv_words <- unlist(str_split(vector, "\\s+"))
#logic <- str_detect(indv_words, "\\w\\w\\w\\w\\w\\w+")
#long_words <- indv_words[logic]
#long_words

longwords <- function(vector) {
  indv_words <- unlist(str_split(vector, "\\s+"))
  logic <- str_detect(indv_words, "\\w\\w\\w\\w\\w\\w+")
  long_words <- indv_words[logic]
  long_words
}

#longwords(test_sentence)
#count how many unique 6+letter words there are in the questions

length(terms_used)
[1] 23584
# filter the dataframe to and include a new column which categorises a question as high or low value

data_6 <- data_5 %>%
  select(value, question) %>%
  mutate(high_low = if_else(value < 800, "low", "high"))

head(data_6, 1000)

#check the proportion of high value and low value questions

data_6 %>% group_by(high_low) %>% summarise(Freq = n(), percent = n()/nrow(data_6)*100) 
NA
#create a function which takes a word and outputs the number of times it appears in a value question

wordcount <- 0

data_6_hv <- data_6 %>% filter(high_low == "high")
data_6_lv <- data_6 %>% filter(high_low == "low")

question_split_and_count <- function(question, word) {
  
  q_long_words <- longwords(question) #vector of long words in each question
  present <- if_else(word %in% q_long_words, 1, 0) # check if word appears in vector
  wordcount <- wordcount + present
}

#run this funtion for high value questions. Test it on the word "british"

sum(unlist(map2(data_6_hv$question, "british", question_split_and_count)))
[1] 69
#run this funtion for low value questions. Test it on the word "british"

sum(unlist(map2(data_6_lv$question, "british", question_split_and_count)))
[1] 83
#iterate through the words of the terms_used vector for the high value questions
#As the term_used vector has 20,000+ words, I don't want to interate through it all, so need to find a way to reduce the number of terms.
#Firstly, when looking at hypothesis testing, you need a minimum of 5 occurances. Lots of these words will likely only occur once or twice throughout the whole dataset so it would be good to get rid of those words
all_words <- paste(data_6$question, collapse = " ")
#create a function which counts the number of times a word appears in all_words

occurances <- function(word){
  searchword <- paste(" ", word, " ", sep="")
  str_count(all_words, searchword)
}

words_appear <- unlist(map(terms_used, occurances))
#We now have vector which contains the number of occurances per word of the terms_used vector. bind this vectors to create a dataframe. 

unique_terms <- as.data.frame(cbind(terms_used, words_appear))

#We only want to include words that have enough appearances, otherwise the chi-squared function won't be reliable. We know that if there are less than 5 instances, then the chi-quared test won't be reliable. Therefore, filter to only keep words with over 5 appearances.

unique_terms_2 <- unique_terms %>%
  mutate(words_appear = as.numeric(words_appear)) %>%
  filter(words_appear > 5) %>%
  arrange(desc(words_appear))

head(unique_terms_2, 100)

#print number of unique terms
nrow(unique_terms_2)
[1] 2616
#now for the analysis, take the top 20 most used terms in questions

popular_terms_used <- unique_terms_2$terms_used[1:20]

#print populat_terms_used vector
popular_terms_used
 [1] "called"               "country"              "played"               "became"              
 [5] "before"               "american"             "capital"              "target=blank>here<a>"
 [9] "french"               "famous"               "president"            "island"              
[13] "little"               "national"             "largest"              "people"              
[17] "author"               "around"               "british"              "meaning"             

#now iterate through the words of the popular_terms_used vector for the high value questions

#test_terms <- c("britain", "america", "canada")

hv_freq <- c()

for(word in popular_terms_used) {
  hv_freq <- c(hv_freq, sum(unlist(map2(data_6_hv$question, word, question_split_and_count))))
}

hv_freq

#iterate through the words of the terms_used vector for the low value questions

lv_freq <- c()

for(word in popular_terms_used) {
  lv_freq <- c(lv_freq, sum(unlist(map2(data_6_lv$question, word, question_split_and_count))))
}

lv_freq
#bind the popular_terms_used vector to the frequencies for high and low value questions

pop_word_freq <- as.data.frame(cbind(popular_terms_used, hv_freq, lv_freq))

#as we know that there are 3 low value questions for every 2 high value questions, calculate the expected probability

p_hv_expected <- 0.4 
p_lv_expected <- 0.6 
p_expected <- c(p_hv_expected, p_lv_expected)

#manipulate the dataframe to include rows for the proportion of each word appearring in high and low value questions. also calculate the p-value for each word.

pop_word_freq_2 <- pop_word_freq %>%
  mutate(hv_freq = as.numeric(hv_freq),
         lv_freq = as.numeric(lv_freq),
         total = (hv_freq + lv_freq),
         prop_hv = hv_freq/total,
         prop_lv = lv_freq/total,
         n = (prop_hv - p_hv_expected)^2/p_hv_expected + (prop_lv - p_lv_expected)^2/p_lv_expected,
         pvalue = 1 - pchisq(n, df = 1)) %>%
  filter(total > 10)

pop_word_freq_2

As none of the p-values were less than 0.05 we cannot reject the null hypothesis. i.e. there is nothing to support any of these terms showing up more in high or low value questions. Disappointing!

---
title: "Popular Categories and Key Words in the American gameshow 'Jeopardy'"
output: html_notebook
---

This project is part of my dataquest course. The aim of the project is to interrogate the data from the American gameshow 'Jeopardy'. The dataset includes every question, the question value, the answer, the date, round and show number. There are two aims: to see if some categories are statistically more likely to appear more often than others ; to see if certain terms in questions are statistically more likely to appear in high value questions. The chi-squared test will be used to answer these questions. 

```{r}
#load libraries

library (readr)
library(stringr)
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyverse)
```

```{r}
#load data and clean column names

setwd("C:/Users/Ana/Desktop/Data Analytics/CSV Files")
data <- read_csv("jeopardy_2.csv")
colnames(data)
colnames(data) <- tolower(colnames(data))
colnames(data)[1:2] <- c("show_number", "air_date")

#print first 5 rows of data
head(data, 5)
#print number of rows in dataframe
nrow(data)
#print the number of unique categories
length(unique(data$category))
```
```{r}
#clean value column and make numeric

data_2 <- data %>%
  mutate(value = str_replace(value, "\\$", "")) %>%
  mutate(value = str_replace(value, "[Nn]one", "")) %>%
  mutate(value = as.numeric(value)) %>%
  drop_na(value)

head(data_2,100)
```

```{r}
#normalise the question, answer and category columns

normalise <- function(vector) {
  vector <- str_replace_all(vector, "[:punct:]", "")
  vector <- tolower(vector)
}

data_3 <- data_2 %>%
  mutate(category = normalise(category)) %>%
  mutate(question = normalise(question)) %>%
  mutate(answer = normalise(answer))

head(data_3)
```

```{r}
#split the air_date column into three columns; day, month, year

data_4 <- data_3 %>%
  separate(air_date, c("day", "month", "year"), sep = "/") %>%
  mutate(day = as.numeric(day)) %>%
  mutate(month = as.numeric(month)) %>%
  mutate(year = as.numeric(year))

head(data_4, 100)

```


```{r}
#count how often the categories 'science', 'history' and 'shakespeare' occur. We want to see if these categories are more likely to occur than other categories. 
#Null hypothesis: science/history/shakespeare categories are equally likely to occur as any other category

science_cat <- data_4 %>%
  filter(category == "science")

sci_observed <- nrow(science_cat)

history_cat <- data_4 %>%
  filter(category == "history")

hist_observed <- nrow(history_cat)

shakespeare_cat <- data_4 %>%
  filter(category == "shakespeare")

ss_observed <- nrow(shakespeare_cat)

sci_observed
hist_observed
ss_observed

```

```{r}
#as there are 3369 unqiue categories, calculate the expected probability for a particular category to appear

n_questions <- nrow(data_4)
p_category_expected <-   1/3369 
p_not_category_expected <- 3368/3369 
p_expected <- c(p_category_expected, p_not_category_expected)

#for 'science', run a chi-squared test to see if we can reject or accept the null hypothesis. 

science_obs <- c(sci_observed, n_questions - sci_observed)

chisq.test(science_obs, p = p_expected)
```
The p-value is < 0.05. I.e. it is extremely unlikely that this result would occur. Therefore we should reject the null hypothesis that there is no bias towards science questions.
```{r}
history_obs <- c(hist_observed, n_questions - hist_observed)

chisq.test(history_obs, p = p_expected)
```
The p-value is < 0.05. I.e. it is extremely unlikely that this result would occur. Therefore we should reject the null hypothesis that there is no bias towards history questions.
```{r}
shakespeare_obs <- c(ss_observed, n_questions - ss_observed)

chisq.test(shakespeare_obs, p = p_expected)
```
The p-value is < 0.05. I.e. it is extremely unlikely that this result would occur. Therefore we should reject the null hypothesis that there is no bias towards shakespeare questions.
```{r}
#order data set by air date

data_5 <- data_4 %>%
  arrange(year, month, day)

```

```{r}
#create a function that takes an input string and outputs a vector of all the 6+ letter words that appear in the input string.

longwords <- function(vector) {
  indv_words <- unlist(str_split(vector, "\\s+"))
  logic <- str_detect(indv_words, "\\w\\w\\w\\w\\w\\w+")
  long_words <- indv_words[logic]
  long_words
}

```

```{r}
#map the longwords function to the question column to create a vector of all unique long (6 letters +) words present in the questions

terms_used <- unique(unlist(map(data_5$question, longwords)))

head(terms_used, 10)


```
```{r}
#count how many unique 6+letter words there are in the questions

length(terms_used)
```

```{r}
# filter the dataframe to and include a new column which categorises a question as high or low value

data_6 <- data_5 %>%
  select(value, question) %>%
  mutate(high_low = if_else(value < 800, "low", "high"))

head(data_6, 1000)

#check the proportion of high value and low value questions

data_6 %>% group_by(high_low) %>% summarise(Freq = n(), percent = n()/nrow(data_6)*100) 

```


```{r}

#split the dataframe into 2 - 1 for high value questions, the other for low value questions

data_6_hv <- data_6 %>% filter(high_low == "high")
data_6_lv <- data_6 %>% filter(high_low == "low")

#create a function which takes a word and outputs the number of times it appears in a high and low value question

wordcount <- 0

question_split_and_count <- function(question, word) {
  
  q_long_words <- longwords(question) #vector of long words in each question
  present <- if_else(word %in% q_long_words, 1, 0) # check if word appears in vector
  wordcount <- wordcount + present
}

#run this funtion for high value questions. Test it on the word "british"

sum(unlist(map2(data_6_hv$question, "british", question_split_and_count)))

#run this funtion for low value questions. Test it on the word "british"

sum(unlist(map2(data_6_lv$question, "british", question_split_and_count)))
```


```{r}
#iterate through the words of the terms_used vector for the high value questions
#As the term_used vector has 20,000+ words, I don't want to interate through it all, so need to find a way to reduce the number of terms.
#Firstly, when looking at hypothesis testing, you need a minimum of 5 occurances. Lots of these words will likely only occur once or twice throughout the whole dataset so it would be good to get rid of those words

```


```{r}
#first, take the terms_used vector and use paste to create one long string containing every word. call this all_words. 

all_words <- paste(data_6$question, collapse = " ")

```


```{r}
#create a function which counts the number of times a word appears in all_words

occurances <- function(word){
  searchword <- paste(" ", word, " ", sep="")
  str_count(all_words, searchword)
}

words_appear <- unlist(map(terms_used, occurances))

```


```{r}
#We now have vector which contains the number of occurances per word of the terms_used vector. bind this vectors to create a dataframe. 

unique_terms <- as.data.frame(cbind(terms_used, words_appear))

#We only want to include words that have enough appearances, otherwise the chi-squared function won't be reliable. We know that if there are less than 5 instances, then the chi-quared test won't be reliable. Therefore, filter to only keep words with over 5 appearances.

unique_terms_2 <- unique_terms %>%
  mutate(words_appear = as.numeric(words_appear)) %>%
  filter(words_appear > 5) %>%
  arrange(desc(words_appear))

head(unique_terms_2, 100)

#print number of unique terms
nrow(unique_terms_2)

#now for the analysis, take the top 20 most used terms in questions

popular_terms_used <- unique_terms_2$terms_used[1:20]

#print populat_terms_used vector
popular_terms_used

```

```{r}

#now iterate through the words of the popular_terms_used vector for the high value questions

#test_terms <- c("britain", "america", "canada")

hv_freq <- c()

for(word in popular_terms_used) {
  hv_freq <- c(hv_freq, sum(unlist(map2(data_6_hv$question, word, question_split_and_count))))
}

hv_freq

```

```{r}

#iterate through the words of the terms_used vector for the low value questions

lv_freq <- c()

for(word in popular_terms_used) {
  lv_freq <- c(lv_freq, sum(unlist(map2(data_6_lv$question, word, question_split_and_count))))
}

lv_freq
```
```{r}
#bind the popular_terms_used vector to the frequencies for high and low value questions

pop_word_freq <- as.data.frame(cbind(popular_terms_used, hv_freq, lv_freq))

#as we know that there are 3 low value questions for every 2 high value questions, calculate the expected probability

p_hv_expected <- 0.4 
p_lv_expected <- 0.6 
p_expected <- c(p_hv_expected, p_lv_expected)

#manipulate the dataframe to include rows for the proportion of each word appearring in high and low value questions. also calculate the p-value for each word.

pop_word_freq_2 <- pop_word_freq %>%
  mutate(hv_freq = as.numeric(hv_freq),
         lv_freq = as.numeric(lv_freq),
         total = (hv_freq + lv_freq),
         prop_hv = hv_freq/total,
         prop_lv = lv_freq/total,
         n = (prop_hv - p_hv_expected)^2/p_hv_expected + (prop_lv - p_lv_expected)^2/p_lv_expected,
         pvalue = 1 - pchisq(n, df = 1)) %>%
  filter(total > 10)

pop_word_freq_2

```
As none of the p-values were less than 0.05 we cannot reject the null hypothesis. i.e. there is nothing to support any of these terms showing up more in high or low value questions. Disappointing!


