QUARTO WORKBOOK

Author

Ciarra Rae Mooney

Data Analysis (Week 3)

6.6.1 Exercises

library(tidyverse)
library(readr)
library(knitr)

Problem A

library(tidyverse)
View(midwest)
library(ggplot2)
data("midwest")

midwest %>% # utilizes the midwest dataset
  group_by(state) %>%  # groups data by the state variable
  summarize(poptotalmean = mean(poptotal), # the mean poptotal (total population) across the counties in each state
            poptotalmed = median(poptotal), # :the median of the poptotal within the state group
            popmax = max(poptotal), # the max poptotal of each state
            popmin = min(poptotal), # the min poptotal of each state
            popdistinct = n_distinct(poptotal), # counts the number of distinct poptotal values in each state
            popfirst = first(poptotal), # extracts the first poptotal value in each state
            popany = any(poptotal < 5000), # TRUE if any county in the state has a poptotal below 5,000
            popany2 = any(poptotal > 2000000)) %>% # TRUE if any county in the state has a poptotal above 2,000,000
  ungroup() # data is no longer grouped by state
# A tibble: 5 × 9
  state poptotalmean poptotalmed  popmax popmin popdistinct popfirst popany
  <chr>        <dbl>       <dbl>   <int>  <int>       <int>    <int> <lgl> 
1 IL         112065.      24486. 5105067   4373         101    66090 TRUE  
2 IN          60263.      30362.  797159   5315          92    31095 FALSE 
3 MI         111992.      37308  2111687   1701          83    10145 TRUE  
4 OH         123263.      54930. 1412140  11098          88    25371 FALSE 
5 WI          67941.      33528   959275   3890          72    15682 TRUE  
# ℹ 1 more variable: popany2 <lgl>

Problem B

midwest %>%
  group_by(state) %>%
  summarize(num5k = sum(poptotal < 5000), # counts the counties in each state with a poptotal below 5,000
            num2mil = sum(poptotal > 2000000), # counts the counties in each state with a poptotal above 2,000,000
            numrows = n()) %>% # n() counts the rows (counties) in each state
  ungroup()
# A tibble: 5 × 4
  state num5k num2mil numrows
  <chr> <int>   <int>   <int>
1 IL        1       1     102
2 IN        0       0      92
3 MI        1       1      83
4 OH        0       0      88
5 WI        2       0      72

Problem C

Part I

midwest %>%
  group_by(county) %>% # grouped according to county
  summarize (x = n_distinct(state)) %>% # summarized by counting the number of distinct states in each county, labeling this category as 'x'
  arrange(desc(x)) %>% # then arranges the values listed above (x) in descending order from highest to lowest no. of states in each county
  ungroup ()
# A tibble: 320 × 2
   county         x
   <chr>      <int>
 1 CRAWFORD       5
 2 JACKSON        5
 3 MONROE         5
 4 ADAMS          4
 5 BROWN          4
 6 CLARK          4
 7 CLINTON        4
 8 JEFFERSON      4
 9 LAKE           4
10 WASHINGTON     4
# ℹ 310 more rows

Part II

Q: How does n() differ from n_distinct()?

A: n() counts the number of rows (observations) within a group, or across the entire dataset if no grouping is applied. It is used inside summarize() or mutate(), typically after group_by(), and returns the total row count for each group. In contrast, n_distinct() counts the number of distinct (unique) values in a vector or column, again either within each group or across the whole dataset.

Q: When would they be the same? different?

A: They are the same when every row in the group has a distinct value in the column being evaluated (no duplicates), and different when some values repeat within the group: n() still counts every row, but n_distinct() counts each repeated value only once.
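A minimal sketch with a made-up tibble (hypothetical data, not from the exercises) illustrates this:

toy <- tibble(g = c("a", "a", "b"), val = c(1, 1, 2)) # group "a" has a repeated value
toy %>%
  group_by(g) %>%
  summarize(rows = n(),                # rows per group: a = 2, b = 1
            uniques = n_distinct(val)) # unique values per group: a = 1, b = 1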

midwest %>%
  group_by (county) %>%
  summarize (x = n()) %>% # counts the no. of rows within a county
  ungroup()
# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         4
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         2
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       2
# ℹ 310 more rows

Part III

midwest %>%
  group_by (county) %>%
  summarize (x = n_distinct(county)) %>% # counts the distinct counties within each county group; x is always 1 because each group is, by definition, a single county
  ungroup ()
# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         1
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         1
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       1
# ℹ 310 more rows
midwest %>%
  group_by (county) %>%
  summarize (x = n_distinct(state)) %>% # the same county name can appear in several states, so values of x greater than 1 make sense here
  ungroup ()
# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         4
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         2
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       2
# ℹ 310 more rows

Problem D

diamonds %>% # utilizes the diamonds dataset
  group_by(clarity) %>% # grouping according to the diamond clarity variable
  summarize(a = n_distinct(color), # summarizing the number of distinct colors within each clarity group, labeled under 'a'
            b = n_distinct(price), # summarizing the number of distinct prices within each clarity group, labeled under 'b'
            c = n()) %>% # counts the number of rows within each clarity group for accuracy purposes
  ungroup()
# A tibble: 8 × 4
  clarity     a     b     c
  <ord>   <int> <int> <int>
1 I1          7   632   741
2 SI2         7  4904  9194
3 SI1         7  5380 13065
4 VS2         7  5051 12258
5 VS1         7  3926  8171
6 VVS2        7  2409  5066
7 VVS1        7  1623  3655
8 IF          7   902  1790

Problem E

Part I

diamonds %>%
  group_by(color, cut) %>% # grouped according to the color and then cut variables
  summarize(m = mean(price), # summaries of the average price of diamonds within each colour and cut combination, labeled 'm'
            s = sd(price)) %>% # summaries of the standard deviation of prices within each colour and cut combination, labeled 's'
  ungroup()
`summarise()` has grouped output by 'color'. You can override using the
`.groups` argument.
# A tibble: 35 × 4
   color cut           m     s
   <ord> <ord>     <dbl> <dbl>
 1 D     Fair      4291. 3286.
 2 D     Good      3405. 3175.
 3 D     Very Good 3470. 3524.
 4 D     Premium   3631. 3712.
 5 D     Ideal     2629. 3001.
 6 E     Fair      3682. 2977.
 7 E     Good      3424. 3331.
 8 E     Very Good 3215. 3408.
 9 E     Premium   3539. 3795.
10 E     Ideal     2598. 2956.
# ℹ 25 more rows

Part II

diamonds %>%
  group_by(cut, color) %>% # grouped according to the cut and then color variables
  summarize(m = mean(price), # summaries of the average price of diamonds within each cut and colour combination, labeled 'm'
            s = sd(price)) %>% # summaries of the standard deviation of prices within each cut and colour combination, labeled 's'
  ungroup()
`summarise()` has grouped output by 'cut'. You can override using the `.groups`
argument.
# A tibble: 35 × 4
   cut   color     m     s
   <ord> <ord> <dbl> <dbl>
 1 Fair  D     4291. 3286.
 2 Fair  E     3682. 2977.
 3 Fair  F     3827. 3223.
 4 Fair  G     4239. 3610.
 5 Fair  H     5136. 3886.
 6 Fair  I     4685. 3730.
 7 Fair  J     4976. 4050.
 8 Good  D     3405. 3175.
 9 Good  E     3424. 3331.
10 Good  F     3496. 3202.
# ℹ 25 more rows

Part III

diamonds %>% 
  group_by(cut, color, clarity) %>% # grouped according to cut, then color, then clarity 
  summarize(m = mean(price),
            s = sd(price),
            msale = m * 0.80) %>% # msale is 80% of the mean price for each combination group, i.e. a 20% discount on the group's average price; selling at msale brings in less than the average price of diamonds with these qualities
  ungroup()
`summarise()` has grouped output by 'cut', 'color'. You can override using the
`.groups` argument.
# A tibble: 276 × 6
   cut   color clarity     m     s msale
   <ord> <ord> <ord>   <dbl> <dbl> <dbl>
 1 Fair  D     I1      7383  5899. 5906.
 2 Fair  D     SI2     4355. 3260. 3484.
 3 Fair  D     SI1     4273. 3019. 3419.
 4 Fair  D     VS2     4513. 3383. 3610.
 5 Fair  D     VS1     2921. 2550. 2337.
 6 Fair  D     VVS2    3607  3629. 2886.
 7 Fair  D     VVS1    4473  5457. 3578.
 8 Fair  D     IF      1620.  525. 1296.
 9 Fair  E     I1      2095.  824. 1676.
10 Fair  E     SI2     4172. 3055. 3338.
# ℹ 266 more rows

Problem F

diamonds %>% 
  group_by(cut) %>% 
  summarize(potato = mean(depth), # Average depth per cut
            pizza = mean(price),# Average price per cut
            popcorn = median(y), # Median values of y per cut
            pineapple = potato - pizza, # Ave depth per cut - ave price per cut
            papaya = pineapple ^ 2, # Square of the difference above
            peach = n()) %>%  # total no. of rows per cut
  ungroup()
# A tibble: 5 × 7
  cut       potato pizza popcorn pineapple    papaya peach
  <ord>      <dbl> <dbl>   <dbl>     <dbl>     <dbl> <int>
1 Fair        64.0 4359.    6.1     -4295. 18444586.  1610
2 Good        62.4 3929.    5.99    -3866. 14949811.  4906
3 Very Good   61.8 3982.    5.77    -3920. 15365942. 12082
4 Premium     61.3 4584.    6.06    -4523. 20457466. 13791
5 Ideal       61.7 3458.    5.26    -3396. 11531679. 21551

Problem G

Part I

diamonds %>%
  group_by(color) %>% # Grouping according to the color variable
  summarize (m = mean(price)) %>% # Average price per color
  mutate (x1 = str_c("Diamond color", color), # pastes the label "Diamond color" onto each colour value to form a single string, stored in a new column called x1
          x2 = 5) %>% # making all x2 values equate to 5, making a new column called x2
  ungroup()
# A tibble: 7 × 4
  color     m x1                x2
  <ord> <dbl> <chr>          <dbl>
1 D     3170. Diamond colorD     5
2 E     3077. Diamond colorE     5
3 F     3725. Diamond colorF     5
4 G     3999. Diamond colorG     5
5 H     4487. Diamond colorH     5
6 I     5092. Diamond colorI     5
7 J     5324. Diamond colorJ     5

Part II

diamonds %>%
  group_by(color) %>% 
  summarize (m = mean(price)) %>% 
  mutate (x1 = str_c("Diamond color", color),
          x2 = 5) 
# A tibble: 7 × 4
  color     m x1                x2
  <ord> <dbl> <chr>          <dbl>
1 D     3170. Diamond colorD     5
2 E     3077. Diamond colorE     5
3 F     3725. Diamond colorF     5
4 G     3999. Diamond colorG     5
5 H     4487. Diamond colorH     5
6 I     5092. Diamond colorI     5
7 J     5324. Diamond colorJ     5

Q: What does the first ungroup() do? Is it useful here? Why/ why not?

A: The ungroup() function removes the grouping from a grouped data frame (or tibble). group_by() creates a grouped data frame so that subsequent operations are carried out within each group; once those operations are done, ungroup() returns a regular data frame so that later steps treat the data as a whole rather than group by group. Here, summarize() with a single grouping variable already returns an ungrouped tibble, so the explicit ungroup() is harmless but not strictly necessary.

Q: Why isn’t there a closing ungroup() after the mutate()?

A: A closing ungroup() may not be necessary if you intend to keep the grouping for further operations or if the final result does not require it. In this case, summarize() already dropped the single grouping level, so the mutate() runs on an ungrouped tibble and a closing ungroup() would change nothing. In general it comes down to how you want to manage grouping through the pipeline: if you do not explicitly call ungroup(), the data remain grouped until you do, or until the grouping is dropped automatically.
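A quick way to see this, assuming the diamonds pipeline above, is to check the grouping with dplyr's group_vars():

diamonds %>%
  group_by(color) %>%
  summarize(m = mean(price)) %>%
  group_vars() # character(0): summarize() has already dropped the single grouping level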

Problem H

Part I

diamonds %>%
  group_by(color) %>% 
  mutate(x1 = price * 0.5) %>% # adding a new column in the dataset that takes the prices of the various color groups multiplied by 0.5
  summarize (m = mean(x1)) %>% # calculating the mean of the above values per colour group, and labeling it 'm'
  ungroup()
# A tibble: 7 × 2
  color     m
  <ord> <dbl>
1 D     1585.
2 E     1538.
3 F     1862.
4 G     2000.
5 H     2243.
6 I     2546.
7 J     2662.

Part II

diamonds %>% 
  group_by(color) %>% 
  mutate(x1 = price * 0.5) %>% 
  ungroup() %>%  
  summarize(m = mean(x1)) 
# A tibble: 1 × 1
      m
  <dbl>
1 1966.

Q: Whats the difference between part I and II?

A: In Part II the data is ungrouped directly after the mutate(), so the mean of x1 is calculated across all diamonds at once rather than separately for each colour group.

2) Why is grouping data necessary?

Grouping data is essential in data analysis as it allows for the aggregation of information, enabling the computation of summary statistics such as means and medians for different categories within a dataset. This process facilitates comparison across groups, helping to identify patterns, trends, and anomalies that may not be apparent when examining the data as a whole. By reducing the complexity of large datasets, grouping makes the information more manageable and easier to visualize, leading to clearer insights and more effective communication of findings. Additionally, many statistical analyses require data to be grouped, as this helps determine if observed differences between groups are statistically significant. Grouping also aids in time series analysis, allowing for the observation of trends over time, and is instrumental in data quality checks, helping to identify outliers and anomalies. Ultimately, grouping data supports informed decision-making by providing a clearer understanding of various segments, guiding strategies, marketing efforts, and resource allocation in business contexts.

3) Why is ungrouping data necessary?

Ungrouping data is a critical step in data analysis as it allows analysts to return to the original, unaggregated state of the dataset for more detailed exploration and interpretation. Once data has been grouped and summarized, certain nuances and individual-level details may be lost, which can hinder a comprehensive understanding of the underlying patterns and relationships. Ungrouping enables analysts to re-examine the full dataset, facilitating more granular analyses and the ability to apply different statistical methods or transformations that may not be applicable to grouped data. Additionally, ungrouping can be essential for visualizing individual data points, providing richer insights through scatter plots or other visualizations that require access to the original values. This step is crucial for verifying results, conducting further analyses, and ensuring that any conclusions drawn from aggregated data are well-supported by the complete dataset. Ultimately, ungrouping data fosters a more thorough investigation, helping to maintain the integrity and depth of the analysis.

4) When should you ungroup data?

Ungrouping data is essential in various scenarios during data analysis. It becomes necessary when a detailed examination of individual observations is required, as aggregated data can obscure critical patterns and relationships. If specific statistical methods, such as regression analysis or machine learning algorithms, are to be applied, ungrouping is crucial, as these techniques often require access to individual-level data. Additionally, for creating visualizations that depict individual data points—like scatter plots—ungrouping allows for a richer representation of the data distribution and helps identify outliers. Moreover, after analyzing grouped data, ungrouping is vital for verifying results, ensuring that patterns observed in aggregates are consistent with the original dataset. This process also aids in data cleaning and quality checks, allowing analysts to spot duplicates or anomalies that may not be visible in summaries. When comparing different groups at an individual level or preparing data for further transformations, ungrouping is necessary to maintain accuracy. Ultimately, ungrouping data fosters a more comprehensive exploration and understanding of the dataset, supporting informed decision-making and accurate conclusions.

6.7 Extra Practice

# Q1:
View(diamonds)

#Q2:
diamonds_low_high <- diamonds %>% # renaming tables for order
  arrange(price)

diamonds_high_low <- diamonds %>%
  arrange(desc(price))

diamonds_price_cut <- diamonds %>%
  arrange(price,
          cut)

diamonds_descprice_cut <- diamonds %>%
  arrange(-price,
          cut)

#Q3:
diamonds_ordered <- diamonds %>%
  arrange(price, clarity)

#Q4:
diamonds_with_sale <- diamonds %>%
  mutate(salePrice = price - 250)

#Q5:
diamonds_cleaned <- diamonds %>%
  select(-x, -y, -z)

#Q6:
diamonds_count <- diamonds %>%
  group_by(cut) %>%
  summarize(num_diamonds = n())

Research Methods

Good Question Based on Diamonds Dataset: “How does the average price of diamonds vary by cut quality, and what is the impact of carat weight on this relationship?”

Bad Question Based on the Diamonds Dataset: “Why are diamonds expensive?”

Pre-Session Exercises (Week 4)

Creating a Scatter Plot

#2.1.2
View(mtcars)
plot(mtcars$wt, mtcars$mpg)  # you can get a similar result to plot() using ggplot() from the ggplot2 package

library(ggplot2) # loading the ggplot2 library package
ggplot(mtcars, aes(x=wt, y=mpg)) + # tells it to create a plot object
  geom_point() # tells it to add a layer of points to the plot

ggplot(data = NULL, aes(x = mtcars$wt, y = mtcars$mpg)) +
  geom_point() # want to pass it 2 vectors for x and y values, use data = NULL

Creating a Line Graph

#2.2.2
plot(pressure$temperature, pressure$pressure, type = "l")

ggplot(pressure, aes(x = temperature, y = pressure)) +
  geom_line() 

ggplot(pressure, aes(x = temperature, y = pressure)) +
  geom_line() +
  geom_point() # adds points to the line graph

Creating a Bar Graph

#2.3.2

barplot(BOD$demand, names.arg = BOD$Time)

table(mtcars$cyl) # there are 11 cases of the value 4, 7 cases of 6, 14 cases of 8

 4  6  8 
11  7 14 
barplot(table(mtcars$cyl)) # generate a table of counts

ggplot(BOD, aes(x = Time, y = demand)) + # Bar graph of values, this uses the BOD data frame, with the "Time" column for x values and the "demand" column for y values
  geom_col()

ggplot(BOD, aes(x = factor(Time), y = demand)) + # convert the x variable to a factor, so it is treated as discrete
  geom_col()

ggplot(mtcars, aes(x = cyl)) + # bar graph of counts with "cyl" column for x position. the y position is calculated by counting the no. of rows for each value of cyl
  geom_bar()

ggplot(mtcars, aes(x = factor(cyl))) +
  geom_bar()

Creating a Histogram

#2.4.1 

hist((mtcars$mpg))

hist(mtcars$mpg, breaks = 10) # specify approximate no. of bins with breaks

ggplot(mtcars, aes(x = mpg)) + # stat_bin() using bins = 30. pick better value with 'binwidth'
  geom_histogram() 
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(binwidth = 4) # with wider bins

Creating a Box Plot

#2.5.1
boxplot(len ~ supp, data = ToothGrowth) # formula syntax

boxplot(len ~ supp + dose, data = ToothGrowth) # put interaction of 2 variables on x-axis

ggplot(ToothGrowth, aes(x = supp, y = len)) +
  geom_boxplot()

ggplot(ToothGrowth, aes(x = interaction(supp, dose), y = len)) + # possible to make box plots for multiple variables by combining them with interaction()
  geom_boxplot()

Post-Session (Week 4)

#install.packages("modeldata")
library(modeldata)

View(crickets)

# The Basics: Scatter plot 
ggplot(crickets, aes(x = temp,
                     y = rate)) +
                     
  geom_point() + 
  labs(x = "Temperature",
       y = "Chirp Rate",
       colour = "Species",
       title = "Cricket Chirps",
       caption = "Source: McDonald (2009)")

ggplot(crickets, aes(x = temp,
                     y = rate,
                     colour = species)) + 
  geom_point() + 
  labs(x = "Temperature",
       y = "Chirp Rate",
       colour = "Species",
       title = "Cricket Chirps",
       caption = "Source: McDonald (2009)")

# Modifying basic properties of the plot
ggplot(crickets, aes(x = temp,
                     y = rate)) + 
  geom_point(color = "red",
             size = 2, # size of points
             alpha = .3,
             shape = "square") + # change the transparency, helps w over plotting
  labs(x = "Temperature",
       y = "Chirp Rate",
       colour = "Species",
       title = "Cricket Chirps",
       caption = "Source: McDonald (2009)")

# Learn more about the options for the geom's
# with ?geom_point

# Adding another layer

ggplot(crickets, aes(x = temp,
                     y = rate)) +
                     
  geom_point() + 
  geom_smooth(method = "lm",
              se = FALSE) + # se = FALSE removes the grey standard-error band
  labs(x = "Temperature",
       y = "Chirp Rate",
       colour = "Species",
       title = "Cricket Chirps",
       caption = "Source: McDonald (2009)")
`geom_smooth()` using formula = 'y ~ x'

# The Basics: Scatter plot 
ggplot(crickets, aes(x = temp,
                     y = rate,
                     colour = species)) +
                     
  geom_point() + 
  geom_smooth(method = "lm",
              se = FALSE) + 
  labs(x = "Temperature",
       y = "Chirp Rate",
       colour = "Species",
       title = "Cricket Chirps",
       caption = "Source: McDonald (2009)")
`geom_smooth()` using formula = 'y ~ x'

# Other plots:

# Histogram
ggplot(crickets, aes(x = rate)) + #frequencies/ counts
  geom_histogram(bins = 15) # one quantitative variable

# Frequency polygon
ggplot(crickets, aes(x = rate)) +
  geom_freqpoly(bins = 15) # same plot just represented in a different way

# Bar graph
ggplot(crickets, aes(x = species)) + # single categorical
  geom_bar()

ggplot(crickets, aes(x = species)) +
  geom_bar(colour = "darkblue",
           fill = "lightblue")

ggplot(crickets, aes(x = species,
                     fill = species)) +
  geom_bar(show.legend = FALSE)

# Box Plot 
ggplot(crickets, aes(x = species, # one quantitative and one categorical
                     y = rate,
                     colour = species)) +
  geom_boxplot(show.legend = FALSE) + 
  theme_minimal()

?theme_minimal

# Faceting

ggplot(crickets, aes(x=rate,
                     fill = species)) +
  geom_histogram(bins = 15) +
  facet_wrap(~species) # wrap it by species

?facet_wrap

Research Methods (Week 4)

Q: What is a good research hypothesis?

A: A good research hypothesis is a clear, testable statement that predicts an outcome based on theoretical background or prior research. It should be specific, measurable, and written in a way that can be either supported or refuted through experimentation or observation. A strong hypothesis often follows an “if-then” structure, linking independent and dependent variables, and it should be grounded in existing knowledge while leaving room for empirical investigation. Additionally, it should be falsifiable, meaning that there is a possibility to disprove it through evidence.

Formative Exercise (Week 5)

Deciding which data visualization to use

Box Plot:

# Load the iris dataset
data(iris)

# Box plot
boxplot(Sepal.Length ~ Species, data = iris, main = "Sepal Length by Species",
        xlab = "Species", ylab = "Sepal Length", col = c("lightblue", "lightgreen", "lightpink"))

T-Test

When to use: Compare the means of two independent groups. Why: It tests whether the means of two groups are statistically different, assuming the data is normally distributed. Example: Comparing the average weight of horses on two different diets.
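As a hedged sketch on a built-in dataset (ToothGrowth rather than horse data), the call would look like:

t.test(len ~ supp, data = ToothGrowth) # compares mean tooth length between the two supplement groups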

ANOVA (Analysis of Variance)

When to use: Compare the means of three or more independent groups. Why: It determines if there are any statistically significant differences between the means of multiple groups, also assuming normality. Example: Comparing the average performance of horses on three different training regimens.
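A minimal sketch on the built-in iris data (again an illustrative stand-in for horse data):

summary(aov(Sepal.Length ~ Species, data = iris)) # one-way ANOVA across the three species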

Kruskal-Wallis Test

When to use: Compare three or more groups when data does not meet ANOVA assumptions (non-normal distribution). Why: It’s a non-parametric test that doesn’t assume normal distribution and can be used for ordinal data. Example: Comparing median race times for horses in different age groups when the data is not normally distributed.
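A sketch of the equivalent non-parametric call, again on iris for illustration:

kruskal.test(Sepal.Length ~ Species, data = iris) # compares the three species without assuming normality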

Wilcoxon Rank-Sum Test (Mann-Whitney U Test)

When to use: Compare two independent groups when data does not meet t-test assumptions (non-normal distribution). Why: It’s a non-parametric test for comparing medians between two groups, making it robust against non-normal data. Example: Comparing the endurance levels of two different breeds of horses.
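A sketch on ToothGrowth for illustration:

wilcox.test(len ~ supp, data = ToothGrowth) # rank-based comparison of the two supplement groups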

Chi-Square Test

When to use: Examine the relationship between two categorical variables. Why: It tests for independence between categorical variables and is useful for frequency data. Example: Investigating if there’s a relationship between coat color (categorical) and likelihood of winning races (categorical).

Scatter Plot:

Correlation Coefficient (Pearson/Spearman)

When to use: Assess the strength and direction of the linear relationship between two continuous variables. Why: Pearson’s correlation is used for normally distributed data, while Spearman’s correlation is used for non-normally distributed or ordinal data. Example: Analyzing the relationship between horse height and weight.
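A sketch of both versions on iris (illustrative only):

cor.test(iris$Sepal.Length, iris$Petal.Length, method = "pearson")  # parametric
cor.test(iris$Sepal.Length, iris$Petal.Length, method = "spearman") # rank-based; warns about ties but still runs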

Linear Regression

When to use: Investigate the linear relationship between an independent variable (predictor) and a dependent variable (outcome). Why: It helps in understanding how the dependent variable changes with respect to the independent variable. Example: Predicting horse performance based on training hours.

Multiple Regression

When to use: Explore relationships involving multiple predictors and one outcome variable. Why: It helps in understanding how multiple factors collectively influence the dependent variable. Example: Predicting horse performance based on training hours, diet, and age.
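A minimal sketch on mtcars (an illustrative stand-in for the horse example):

summary(lm(mpg ~ wt + hp, data = mtcars)) # two predictors, one outcome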

Non-Linear Regression

When to use: Model relationships where the effect of the predictor on the dependent variable is non-linear. Why: Many real-world relationships are non-linear, and this method can provide a better fit for such data. Example: Modeling growth patterns of foals over time.
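One simple hedged sketch is a polynomial term, which lets the fitted curve bend; models that are genuinely non-linear in their parameters would use nls() instead:

summary(lm(dist ~ poly(speed, 2), data = cars)) # quadratic fit of stopping distance on speed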

Bar Graph:

# Load necessary libraries
library(tidyverse)
library(dplyr)
library(ggplot2)

# Load the iris dataset
data(iris)

# Create a new variable 'size' based on Sepal Length
iris <- iris %>%
  mutate(size = ifelse(Sepal.Length < median(Sepal.Length), "small", "big"))

# Bar plot for the new variable 'size'
ggplot(iris, aes(x = size, fill = Species)) +
  geom_bar(position = "dodge") +
  labs(title = "Count of Small and Big Sepal Lengths by Species", x = "Size", y = "Count") +
  scale_fill_manual(values = c("lightblue", "lightgreen", "lightpink")) +
  theme_minimal()

Chi-Square Test

When to use: Compare the frequencies of different categories in a dataset. Why: It helps determine whether there is a significant association between categorical variables. Example: Examining if there’s a significant difference in the number of horses in different coat color categories.

ANOVA (Analysis of Variance)

When to use: Compare means of a continuous variable across multiple categories. Why: It evaluates whether there are significant differences in means across groups. Example: Comparing average feed intake across different stables.

Kruskal-Wallis Test

When to use: Compare medians of a continuous variable across multiple categories when data is not normally distributed. Why: It’s a non-parametric alternative to ANOVA for non-normal data. Example: Comparing median race times across different age groups.

file.rename("Quarto-Workbook-N1306063.rmarkdown", "Quarto_Workbook_N1306063.rmd")
[1] TRUE

Deciding which statistical test to use

Flow Chart of Test Decisions

Chi-squared Tests (Week 6)

Chi-squared goodness of fit test OR chi-squared test of independence: the proportion of flowers that are small, medium, and large.

library(tidyverse)
library(patchwork)

data()
head(iris)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species  size
1          5.1         3.5          1.4         0.2  setosa small
2          4.9         3.0          1.4         0.2  setosa small
3          4.7         3.2          1.3         0.2  setosa small
4          4.6         3.1          1.5         0.2  setosa small
5          5.0         3.6          1.4         0.2  setosa small
6          5.4         3.9          1.7         0.4  setosa small
flowers <- iris %>%
  mutate(size = cut(Sepal.Length, 
                    breaks = 3, 
                    labels = c("Small", "Medium", "Large"))) %>%
  select(Species, size)
  
  table(flowers) # can see what the counts are for these species
            size
Species      Small Medium Large
  setosa        47      3     0
  versicolor    11     36     3
  virginica      1     32    17

Chi-Squared Goodness of Fit Test

Question: Is there a statistically significant difference in the proportion of flowers that are small, medium, and large (alpha = 0.05)?

H0: The proportions of flowers that are small, medium sized, and large are equal. H1: The proportions of flowers that are small, medium sized, and large are not equal.

table(flowers$size)

 Small Medium  Large 
    59     71     20 
flowers %>%
  select(size) %>% # only looking at size
  table() %>% # pipe that into a table
  chisq.test() # then pipe that into the chi square test

    Chi-squared test for given probabilities

data:  .
X-squared = 28.44, df = 2, p-value = 6.673e-07
# OR:

chisq.test(table(flowers$size)) # gives same result

    Chi-squared test for given probabilities

data:  table(flowers$size)
X-squared = 28.44, df = 2, p-value = 6.673e-07

P-value = 0.0000007 (extremely small). It is therefore very unlikely we would see counts this uneven if the proportions were truly equal. Statistically significant. Reject H0 - evidence that the proportions are not equal.

Chi-Squared Test of Independence

H0: The variables are independent

There is no relationship between the variables. Knowing the value of one variable does not help to predict the value of the other variable.

H1: The variables are dependent

There is a relationship between the variables. Knowing the value of one variable does help to predict the value of the other variable.

table(flowers)
            size
Species      Small Medium Large
  setosa        47      3     0
  versicolor    11     36     3
  virginica      1     32    17
chisq.test(table(flowers)) # not just doing size, doing the entire table

    Pearson's Chi-squared test

data:  table(flowers)
X-squared = 111.63, df = 4, p-value < 2.2e-16
# OR: 

flowers %>%
  table() %>% # pipe that into a table
  chisq.test()

    Pearson's Chi-squared test

data:  .
X-squared = 111.63, df = 4, p-value < 2.2e-16
# OR:

table(flowers) %>%
  chisq.test()

    Pearson's Chi-squared test

data:  .
X-squared = 111.63, df = 4, p-value < 2.2e-16

The p-value is still very small. Reject H0; the result is statistically significant: there is a relationship between species and size.

Fisher's Exact Test

Use Fisher's exact test if more than 20% of the expected counts are below 5, or if any expected count is below 5 in a 2x2 table.

flowers %>%
  table() %>% 
  chisq.test() %>%
  .$expected # gives expected values - none are less than 5, so we do not need Fisher's exact test
            size
Species         Small   Medium    Large
  setosa     19.66667 23.66667 6.666667
  versicolor 19.66667 23.66667 6.666667
  virginica  19.66667 23.66667 6.666667
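Not needed here, since none of the expected counts are below 5, but if they had been, a sketch of the call would be (simulate.p.value keeps the computation manageable for tables larger than 2x2):

fisher.test(table(flowers), simulate.p.value = TRUE)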

T-tests (Week 7)

library(tidyverse)
#install.packages("patchwork")
#install.packages("gapminder")
library(patchwork)
library(gapminder)

data()
View(gapminder)

Single sample t-test

Hypothesis testing

H0: The mean life expectancy is 50 years. H1: The mean life expectancy is not 50 years.

Observation: Sample data provides a mean life expectancy of 48.9. Is this statistically significant?

gapminder %>%
  filter(continent == "Africa") %>% # Only want to extract African data
  select(lifeExp) %>% # Selecting the specific variable for life expectancy
  t.test(mu = 50) # mu = population mean; the assumed mean is 50 - testing whether the sample is significantly different from that value

    One Sample t-test

data:  .
t = -3.0976, df = 623, p-value = 0.002038
alternative hypothesis: true mean is not equal to 50
95 percent confidence interval:
 48.14599 49.58467
sample estimates:
mean of x 
 48.86533 

P-value = the probability of obtaining a sample mean at least this far from 50 if the null hypothesis were true. Usually 5% (0.05) is used as the cut-off for significance. This value is well under 0.05, therefore we can reject the null hypothesis. The confidence interval also does not include 50, consistent with the significant result.

my_ttest <- gapminder %>% # creating an object
  filter(continent == "Africa") %>% 
  select(lifeExp) %>% 
  t.test(mu = 50)

attributes(my_ttest) # gives you specific attributes you can call for from the data, listed in the names section
$names
 [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
 [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  

$class
[1] "htest"
my_ttest$p.value # specifically calling for p.value
[1] 0.002038444

Two-sided test for difference of means

gapminder %>%
  filter(continent %in% c("Africa", "Europe")) %>%
  t.test(lifeExp ~ continent, data = ., # piping data into the t-test
  alternative = "two.sided") # is the default anyways      

    Welch Two Sample t-test

data:  lifeExp by continent
t = -49.551, df = 981.2, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Africa and group Europe is not equal to 0
95 percent confidence interval:
 -23.95076 -22.12595
sample estimates:
mean in group Africa mean in group Europe 
            48.86533             71.90369 

Two-sided = we are asking whether there is a difference between the average life expectancy in Africa and Europe in either direction, bigger or smaller: is the difference significantly different from zero? The p-value is extremely small, so the difference is significant. Reject the null hypothesis. Statistically significant.

One-sided test for difference of means

Giving direction to our question, not simply asking is there a difference. Want to know if Ireland has a lower life expectancy than Switzerland.

H0: There is no difference in life expectancy between Switzerland (S) and Ireland (I). H1: Life expectancy in Ireland is lower than in Switzerland (I < S). Alpha = 0.05

gapminder %>%
  filter(country %in% c("Ireland", "Switzerland")) %>%
  t.test(lifeExp ~ country, data = . ,
         alternative = "less", # the alternative to the H0 is less, is the difference less
         conf.level = 0.95) # can change this, but this is the default - would still work wihtout this confidence

    Welch Two Sample t-test

data:  lifeExp by country
t = -1.6337, df = 21.77, p-value = 0.05835
alternative hypothesis: true difference in means between group Ireland and group Switzerland is less than 0
95 percent confidence interval:
      -Inf 0.1313697
sample estimates:
    mean in group Ireland mean in group Switzerland 
                 73.01725                  75.56508 

It is not statistically significant (p = 0.058), so we fail to reject H0: there is insufficient evidence that Ireland's average life expectancy is lower than Switzerland's.

Paired t-test

Sample data where each observation has a paired counterpart - here, the exact same countries measured in two different years.

gapminder %>%
  filter(year %in% c(1957, 2007) & # only want data from these 2 specific years, combined with the second condition
           continent == "Africa") %>%
  mutate(year = factor(year, levels = c(2007, 1957))) %>% # overwriting year as a factor with levels 2007 then 1957, so the test reports the 2007 mean minus the 1957 mean
  t.test(lifeExp ~ year, data = .)

    Welch Two Sample t-test

data:  lifeExp by year
t = 8.7561, df = 82.126, p-value = 2.175e-13
alternative hypothesis: true difference in means between group 2007 and group 1957 is not equal to 0
95 percent confidence interval:
 10.46364 16.61575
sample estimates:
mean in group 2007 mean in group 1957 
          54.80604           41.26635 
# getting error messages for this code when I add in: paired = TRUE?
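The error probably comes from newer versions of R no longer accepting paired = TRUE in the formula interface of t.test(). A hedged sketch of one workaround is to reshape the data so each country's two years sit in the same row and pass the two columns directly:

paired_data <- gapminder %>%
  filter(year %in% c(1957, 2007) & continent == "Africa") %>%
  select(country, year, lifeExp) %>%
  pivot_wider(names_from = year, values_from = lifeExp)

t.test(paired_data$`2007`, paired_data$`1957`, paired = TRUE) # paired differences, 2007 minus 1957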

Assumptions of the t-test

  1. Large, representative sample
  2. Values are normally distributed
  3. Two samples have very similar variance

How to check variance:

var(gapminder$lifeExp[gapminder$country=="Ireland"])
[1] 13.09338
var(gapminder$lifeExp[gapminder$country=="Switzerland"])
[1] 16.09271

The variances are very close between the two countries, so a t-test is appropriate.
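A hedged sketch of a more formal check is an F test for equality of variances:

gapminder %>%
  filter(country %in% c("Ireland", "Switzerland")) %>%
  var.test(lifeExp ~ country, data = .) # H0: the two variances are equal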

ANOVA (Week 7)

ANOVA = analysis of variance. A t-test cannot cope with three or more groups, so we conduct an ANOVA instead. BUT the underlying principle of H0 does NOT change.

H0: There is no difference in mean life expectancy between the groups (here, continents). H1: There is a difference; the means are not all equal. Alpha = 0.05

Create a data set to work with

library(tidyverse)
library(patchwork)
library(gapminder)
library(forcats)

data() # calling the gapminder data
head(gapminder)
# A tibble: 6 × 6
  country     continent  year lifeExp      pop gdpPercap
  <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
1 Afghanistan Asia       1952    28.8  8425333      779.
2 Afghanistan Asia       1957    30.3  9240934      821.
3 Afghanistan Asia       1962    32.0 10267083      853.
4 Afghanistan Asia       1967    34.0 11537966      836.
5 Afghanistan Asia       1972    36.1 13079460      740.
6 Afghanistan Asia       1977    38.4 14880372      786.
gapdata <- gapminder %>% # creating a new object, calling it gapdata, %>% = 'and then'
  filter(year == 2007 & # '==' means we are asking a question: does the value equal 2007? If it does, the observation is included
           continent %in% c("Americas", "Europe", "Asia")) %>% # AND second condition
  select(continent, lifeExp)

Take a look at the distribution of means

gapdata %>%
  group_by(continent) %>%
  summarise(Mean_life = mean(lifeExp)) %>%
  arrange(Mean_life)
# A tibble: 3 × 2
  continent Mean_life
  <fct>         <dbl>
1 Asia           70.7
2 Americas       73.6
3 Europe         77.6

Research Question: Is the life expectancy in these 3 continents different?

Hypothesis testing: H0: Mean life expectancy is the same. H1: Mean life expectancy is not the same.

Observation: A difference in means is observed in the sample data, but is it statistically significant (alpha = 0.05)?

Create ANOVA model

gapdata %>% # using named data
  aov(lifeExp ~ continent, data = .) %>% # aov = ANOVA, life expectancy by continent
  summary()
            Df Sum Sq Mean Sq F value   Pr(>F)    
continent    2  755.6   377.8   11.63 3.42e-05 ***
Residuals   85 2760.3    32.5                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
aov_model <- gapdata %>%
  aov(lifeExp ~ continent, data = .)

The p-value is the key output. The probability is extremely low, well below 0.05, so we reject H0: the means are significantly different. BUT which groups differ from each other?

gapdata %>% 
  aov(lifeExp ~ continent, data = .) %>%
  TukeyHSD() # Tukey's Honestly Significant Difference: pairwise comparisons of the continent means with confidence intervals (interpreted below)
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = lifeExp ~ continent, data = .)

$continent
                     diff        lwr        upr     p adj
Asia-Americas   -2.879635 -6.4839802  0.7247099 0.1432634
Europe-Americas  4.040480  0.3592746  7.7216854 0.0279460
Europe-Asia      6.920115  3.4909215 10.3493088 0.0000189

TukeyHSD() considers the difference between each pair of continents and gives a confidence interval for it. If the interval includes zero, there is no significant difference for that pair. The Asia-Americas interval does include zero, but the Europe-Americas and Europe-Asia intervals do not - so those two differences are significant. Reject H0.

Correlation - r (Pre-session Week 8)

Absolute value of r |r|

  • 0 < |r| < 0.1 = very weak

  • 0.1 < |r| < 0.2 = weak

  • 0.2 < |r| < 0.3 = moderate

  • |r| > 0.3 = strong

NOTE: r is NOT the slope! The correlation coefficient tells us whether it is possible to predict one variable based on the other, but it does NOT tell us how to make that prediction.

r ONLY represents linear relationships. Non-linear correlations can be strong, too. But this strength is not picked up by r very well.

r could be unduly influenced by outliers.
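A small sketch of the point that r is not the slope, using the built-in cars data: rescaling the x-variable changes the slope but leaves r untouched.

cor(cars$speed, cars$dist)                     # r is unit-free
coef(lm(dist ~ speed, data = cars))[2]         # slope in the original units
coef(lm(dist ~ I(speed / 10), data = cars))[2] # slope is 10x larger after rescaling, but r is unchanged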

Exercises

5.9.1) b

5.9.2) a

5.9.3) a or c? it is NEAR perfect

5.9.5.1) This is a strong positive correlation. It means that individuals who have higher reading scores are very likely to have a higher IQ test result.

5.9.5.2) This is a strong negative correlation. It means that students who have good citizenship are very likely to be less aggressive. This correlation is not as strong as the previous one, so the relationship is weaker.

5.9.5.3) There is practically no correlation between weight and sociability. It means that heavier people are just as likely to be sociable as lighter people.

Simple Linear Model (Pre-session Week 9)

Two numeric values e.g. speed of a car vs. distance taken to stop.

  • Y-intercept

  • Slope: as long as you have the y-int and the slope, you can find the line and can predict any values for x & y

  • p: if p is extremely small, we can reject H0 (that the slope is zero) and conclude that the slope is not zero.

  • R2: a proportion between 0 and 1; the share of the variation in the y-variable explained by variation in the x-variable. Multiplying by 100 gives a percentage.

library(tidyverse)
data()
head(cars, 10) # calling cars data, want the first 10 values
   speed dist
1      4    2
2      4   10
3      7    4
4      7   22
5      8   16
6      9   10
7     10   18
8     10   26
9     10   34
10    11   17
cars %>%
  lm(dist ~ speed, data = .) %>% # lm = function for linear models, distance = dependent variable (y) first, then speed = independent variable (x)
  summary()

Call:
lm(formula = dist ~ speed, data = .)

Residuals:
    Min      1Q  Median      3Q     Max 
-29.069  -9.525  -2.272   9.215  43.201 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -17.5791     6.7584  -2.601   0.0123 *  
speed         3.9324     0.4155   9.464 1.49e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 15.38 on 48 degrees of freedom
Multiple R-squared:  0.6511,    Adjusted R-squared:  0.6438 
F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12
# Another way to do it:
mod <- lm(dist ~ speed, data = cars) # store the fitted model in a new object
mod

Call:
lm(formula = dist ~ speed, data = cars)

Coefficients:
(Intercept)        speed  
    -17.579        3.932  
summary(mod)

Call:
lm(formula = dist ~ speed, data = cars)

Residuals:
    Min      1Q  Median      3Q     Max 
-29.069  -9.525  -2.272   9.215  43.201 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -17.5791     6.7584  -2.601   0.0123 *  
speed         3.9324     0.4155   9.464 1.49e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 15.38 on 48 degrees of freedom
Multiple R-squared:  0.6511,    Adjusted R-squared:  0.6438 
F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12
attributes(mod) # can see all the attributes for this model
$names
 [1] "coefficients"  "residuals"     "effects"       "rank"         
 [5] "fitted.values" "assign"        "qr"            "df.residual"  
 [9] "xlevels"       "call"          "terms"         "model"        

$class
[1] "lm"
mod$residuals # can see all the individual values for the residuals
         1          2          3          4          5          6          7 
  3.849460  11.849460  -5.947766  12.052234   2.119825  -7.812584  -3.744993 
         8          9         10         11         12         13         14 
  4.255007  12.255007  -8.677401   2.322599 -15.609810  -9.609810  -5.609810 
        15         16         17         18         19         20         21 
 -1.609810  -7.542219   0.457781   0.457781  12.457781 -11.474628  -1.474628 
        22         23         24         25         26         27         28 
 22.525372  42.525372 -21.407036 -15.407036  12.592964 -13.339445  -5.339445 
        29         30         31         32         33         34         35 
-17.271854  -9.271854   0.728146 -11.204263   2.795737  22.795737  30.795737 
        36         37         38         39         40         41         42 
-21.136672 -11.136672  10.863328 -29.069080 -13.069080  -9.069080  -5.069080 
        43         44         45         46         47         48         49 
  2.930920  -2.933898 -18.866307  -6.798715  15.201285  16.201285  43.201285 
        50 
  4.268876 
hist(mod$residuals)

Residuals: the vertical distances between the observed points and the best-fit line. In this example the inter-quartile range is reasonably evenly distributed around 0 (-9.53 to 9.22). Residuals help judge how well the model fits.

Coefficients: every model needs an intercept, even when it is not meaningful on its own (a stopping distance of -17.6 ft is impossible, but the value is still needed to define the line). The speed coefficient gives the slope (3.93), which is what matters: it is used to predict y-values as a function of x. The p-value for the slope, shown at the end of its row, is extremely small, so we reject H0 and conclude the slope is statistically significant.

Multiple R-squared: 0.65. An adjusted R-squared is not needed for a simple single-predictor regression, but it would be used in a multivariable model with more than one explanatory variable.

new_speeds <- data.frame(speed = c(10, 15, 20)) # the model can be used for prediction: put some new speeds (independent variable values) in a new data frame
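A sketch of the prediction step the comment describes, using the mod object fitted above:

predict(mod, newdata = new_speeds) # predicted stopping distances at speeds 10, 15 and 20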

Regression Notes

Regression is a means of exploring the variation in some quantity.

The variation in the dependent variable (y) is separated into a component explained by the independent variable(s) (x) and an unexplained (error) component.

y_i = b_0 + b_1*x_i + e_i

dependent variable = y-intercept + (slope X independent variable) + error

  • Linear relationship
  • BUT error term makes it a regression
  • Estimating the “betas”
  • Quantifying the errors
  • Min sum of (error terms) squared

SSR/SSE/SST

Total variance in ice cream sales = SST = Sum of Squares (Total):

  • Variance explained by the temperature = SSR = Sum of Squares due to Regression
  • Variance still unexplained = SSE = Sum of Squares due to Error

SST = SSR + SSE

R-Squared

R-squared is the proportion of the VARIATION in the Y-variable being explained by the VARIATION in the X-variables.

R^2 = SSR/SST OR SSR/(SSR + SSE)

  • Low SSE = High R squared (inversely related)
  • High SSE = Low R squared
  • Low R^2 = Weaker relationship
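A quick sketch checking this on the cars model fitted earlier: the sums of squares from anova() reproduce the Multiple R-squared reported by summary().

mod <- lm(dist ~ speed, data = cars)
ss <- anova(mod)[["Sum Sq"]] # first element is SSR (speed), second is SSE (residuals)
ss[1] / sum(ss)              # SSR / (SSR + SSE), about 0.65 as reported above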

QUESTION: What is the minimum number of data points you need to run a regression?

degrees of freedom (df) = n - k - 1

  • n = No. of observations
  • k = no. of x variables
  • By adding more explanatory variables, degrees of freedom are reduced.
  • Thus, the opportunity for “error” in the model is reduced.
  • df = how much error can we potentially show in this model.

Adjusted R-squared

Adding more x- variables to our model e.g. ice cream sales affected by temperature (k = 1); temp. & rain (k = 2); temp, rain, holidays (k = 3); temp, rain, holidays, moon phase (k = 4)

  • R^2 will naturally increase as variables are added, irrespective of how useless they are to the model, which falsely suggests the model is improving.
  • We need to use an adjusted R^2 when there is more than one explanatory variable to get a more honest measure of fit (see the sketch after this list).
  • Adjusts for the fact that we are losing more degrees of freedom.
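A hedged sketch of this effect using mtcars and a made-up noise variable: R^2 can only go up when another predictor is added, while adjusted R^2 penalises the lost degree of freedom.

set.seed(1)
mt <- mtcars
mt$noise <- rnorm(nrow(mt)) # pure noise, unrelated to mpg
summary(lm(mpg ~ wt, data = mt))$r.squared
summary(lm(mpg ~ wt + noise, data = mt))$r.squared     # never lower than the simpler model
summary(lm(mpg ~ wt, data = mt))$adj.r.squared
summary(lm(mpg ~ wt + noise, data = mt))$adj.r.squared # penalised for the extra degree of freedom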

Data Analysis Task (Week 9)

library(tidyverse)
# Load the mtcars dataset
data(mtcars)

# Fit a linear model
model <- lm(mpg ~ wt, data = mtcars)

# Display the summary of the model
summary(model)

Call:
lm(formula = mpg ~ wt, data = mtcars)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.5432 -2.3647 -0.1252  1.4096  6.8727 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  37.2851     1.8776  19.858  < 2e-16 ***
wt           -5.3445     0.5591  -9.559 1.29e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.046 on 30 degrees of freedom
Multiple R-squared:  0.7528,    Adjusted R-squared:  0.7446 
F-statistic: 91.38 on 1 and 30 DF,  p-value: 1.294e-10
# Perform ANOVA on the model
anova(model)
Analysis of Variance Table

Response: mpg
          Df Sum Sq Mean Sq F value    Pr(>F)    
wt         1 847.73  847.73  91.375 1.294e-10 ***
Residuals 30 278.32    9.28                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

1. Key outputs from the model summary:

Intercept (37.29): This is the predicted miles per gallon (mpg) when the weight (wt) of the car is zero. While not realistic (a car cannot have zero weight), it serves as the baseline for the fitted line. Slope (-5.34): for every unit increase in weight (wt is in units of 1,000 lbs), mpg decreases by about 5.34, indicating a negative relationship between vehicle weight and fuel efficiency. R^2 = 0.753: about 75.3% of the variability in mpg is explained by the weight of the car. F-statistic (91.38, p < 0.001): this confirms the model's predictive ability is statistically significant.

2. ANOVA Table: The ANOVA results test whether the weight significantly affects mpg:

F-statistic: 91.38 with a p-value of 1.29 x 10^-10, indicating strong statistical significance. Sum of Squares for wt (847.73): this represents the variability in mpg explained by weight. Residual Sum of Squares (278.32): the remaining unexplained variability in mpg.

Interpretation: Weight's Effect on Efficiency: Heavier cars are significantly less fuel-efficient, as evidenced by the negative slope (-5.34) and the strong statistical significance (p < 0.001). Model Fit: The model explains a substantial portion of the variability in mpg, making weight a critical factor in determining fuel efficiency.

Research Methods Task (Week 9)

Logistic Regression (Week 10 Pre-session)

library(ggplot2)
library(cowplot)

Attaching package: 'cowplot'
The following object is masked from 'package:patchwork':

    align_plots
The following object is masked from 'package:lubridate':

    stamp
url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
data <- read.csv(url, header = FALSE)
head(data) # the head() function shows us the first 6 rows of data
  V1 V2 V3  V4  V5 V6 V7  V8 V9 V10 V11 V12 V13 V14
1 63  1  1 145 233  1  2 150  0 2.3   3 0.0 6.0   0
2 67  1  4 160 286  0  2 108  1 1.5   2 3.0 3.0   2
3 67  1  4 120 229  0  2 129  1 2.6   2 2.0 7.0   1
4 37  1  3 130 250  0  0 187  0 3.5   3 0.0 3.0   0
5 41  0  2 130 204  0  2 172  0 1.4   1 0.0 3.0   0
6 56  1  2 120 236  0  0 178  0 0.8   1 0.0 3.0   0
# Unfortunately, columns were not named, so rename them according to UCI website:
colnames(data) <- c(
  "age",
  "sex",
  "cp",
  "trestbps",
  "chol",
  "fbs",
  "restecg",
  "thalach",
  "exang",
  "oldpeak",
  "slope",
  "ca",
  "thal",
  "hd"
)

head(data) # Now when we look at the first 6 rows with head(), things look a lot better
  age sex cp trestbps chol fbs restecg thalach exang oldpeak slope  ca thal hd
1  63   1  1      145  233   1       2     150     0     2.3     3 0.0  6.0  0
2  67   1  4      160  286   0       2     108     1     1.5     2 3.0  3.0  2
3  67   1  4      120  229   0       2     129     1     2.6     2 2.0  7.0  1
4  37   1  3      130  250   0       0     187     0     3.5     3 0.0  3.0  0
5  41   0  2      130  204   0       2     172     0     1.4     1 0.0  3.0  0
6  56   1  2      120  236   0       0     178     0     0.8     1 0.0  3.0  0
str(data) # However, the str() function, which describes the **str**ucture of the data, tells us that some of the columns are messed up
'data.frame':   303 obs. of  14 variables:
 $ age     : num  63 67 67 37 41 56 62 57 63 53 ...
 $ sex     : num  1 1 1 1 0 1 0 0 1 1 ...
 $ cp      : num  1 4 4 3 2 2 4 4 4 4 ...
 $ trestbps: num  145 160 120 130 130 120 140 120 130 140 ...
 $ chol    : num  233 286 229 250 204 236 268 354 254 203 ...
 $ fbs     : num  1 0 0 0 0 0 0 0 0 1 ...
 $ restecg : num  2 2 2 0 2 0 2 0 2 2 ...
 $ thalach : num  150 108 129 187 172 178 160 163 147 155 ...
 $ exang   : num  0 1 1 0 0 0 0 1 0 1 ...
 $ oldpeak : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
 $ slope   : num  3 2 2 3 1 1 3 1 2 3 ...
 $ ca      : chr  "0.0" "3.0" "2.0" "0.0" ...
 $ thal    : chr  "6.0" "3.0" "7.0" "3.0" ...
 $ hd      : int  0 2 1 0 0 0 3 0 2 1 ...
# Right now sex is a number, but is supposed to be a factor, where 0 represents "female" and 1 represents "male"
# CP is also supposed to be a factor

# Clean-up:

data[data == "?"] <- NA # Rename question marks NA
data[data$sex == 0,]$sex <- F # make data easier to read, convert 0s to F

data$sex <- as.factor(data$sex) # convert each of these columns into a factor
data$cp <- as.factor(data$cp)
data$fbs <- as.factor(data$fbs)
data$restecg <- as.factor(data$restecg)
data$exang <- as.factor(data$exang)
data$slope <- as.factor(data$slope)

data$ca <- as.integer(data$ca) # R thinks it's a column of strings; tell it that it's a column of integers
data$ca <- as.factor(data$ca) # then say its a factor

data$thal <- as.integer(data$thal) # R thinks it's a column of strings; tell it that it's a column of integers
data$thal <- as.factor(data$thal) # then say its a factor

data$hd <- ifelse(test=data$hd == 0, yes="Healthy", no="Unhealthy") # convert 0 to "Healthy" and any non-zero value to "Unhealthy" (numbers to words)
data$hd <- as.factor(data$hd)

str(data)
'data.frame':   303 obs. of  14 variables:
 $ age     : num  63 67 67 37 41 56 62 57 63 53 ...
 $ sex     : Factor w/ 2 levels "0","1": 2 2 2 2 1 2 1 1 2 2 ...
 $ cp      : Factor w/ 4 levels "1","2","3","4": 1 4 4 3 2 2 4 4 4 4 ...
 $ trestbps: num  145 160 120 130 130 120 140 120 130 140 ...
 $ chol    : num  233 286 229 250 204 236 268 354 254 203 ...
 $ fbs     : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
 $ restecg : Factor w/ 3 levels "0","1","2": 3 3 3 1 3 1 3 1 3 3 ...
 $ thalach : num  150 108 129 187 172 178 160 163 147 155 ...
 $ exang   : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 2 1 2 ...
 $ oldpeak : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
 $ slope   : Factor w/ 3 levels "1","2","3": 3 2 2 3 1 1 3 1 2 3 ...
 $ ca      : Factor w/ 4 levels "0","1","2","3": 1 4 3 1 1 1 3 1 2 1 ...
 $ thal    : Factor w/ 3 levels "3","6","7": 2 1 3 1 1 1 1 1 3 3 ...
 $ hd      : Factor w/ 2 levels "Healthy","Unhealthy": 1 2 2 1 1 1 2 1 2 2 ...
nrow(data[is.na(data$ca) | is.na(data$thal),]) # count the rows that have NA in ca or thal (there are 6) before dealing with them
[1] 6
data[is.na(data$ca) | is.na(data$thal),]
    age sex cp trestbps chol fbs restecg thalach exang oldpeak slope   ca thal
88   53   0  3      128  216   0       2     115     0     0.0     1    0 <NA>
167  52   1  3      138  223   0       0     169     0     0.0     1 <NA>    3
193  43   1  4      132  247   1       2     143     1     0.1     2 <NA>    7
267  52   1  4      128  204   1       0     156     1     1.0     2    0 <NA>
288  58   1  2      125  220   0       0     144     0     0.4     2 <NA>    7
303  38   1  3      138  175   0       0     173     0     0.0     1 <NA>    3
           hd
88    Healthy
167   Healthy
193 Unhealthy
267 Unhealthy
288   Healthy
303   Healthy
nrow(data)
[1] 303
data <- data[is.na(data$ca) | is.na(data$thal),] # NOTE: as written this keeps only the 6 rows with NA (hence the small counts below); to drop them instead, negate the condition with !
nrow(data)
[1] 6
#####################################
##
## Now we can do some quality control by making sure all of the factor
## levels are represented by people with and without heart disease (hd)
##
## NOTE: We also want to exclude variables that only have 1 or 2 samples in
## a category since +/- one or two samples can have a large effect on the
## odds/log(odds)
##
##
#####################################
xtabs(~ hd + sex, data=data)
           sex
hd          0 1
  Healthy   1 3
  Unhealthy 0 2
xtabs(~ hd + cp, data=data)
           cp
hd          1 2 3 4
  Healthy   0 1 3 0
  Unhealthy 0 0 0 2
xtabs(~ hd + fbs, data=data)
           fbs
hd          0 1
  Healthy   4 0
  Unhealthy 0 2
xtabs(~ hd + restecg, data=data)
           restecg
hd          0 1 2
  Healthy   3 0 1
  Unhealthy 1 0 1
xtabs(~ hd + exang, data=data)
           exang
hd          0 1
  Healthy   4 0
  Unhealthy 0 2
xtabs(~ hd + slope, data=data)
           slope
hd          1 2 3
  Healthy   3 1 0
  Unhealthy 0 2 0
xtabs(~ hd + ca, data=data)
           ca
hd          0 1 2 3
  Healthy   1 0 0 0
  Unhealthy 1 0 0 0
xtabs(~ hd + thal, data=data)
           thal
hd          3 6 7
  Healthy   2 0 1
  Unhealthy 0 0 1
#####################################
##
## Now we are ready for some logistic regression. First we'll create a very
## simple model that uses sex to predict heart disease
##
#####################################
 
## let's start super simple and see if sex (female/male) is a good
## predictor...
## First, let's just look at the raw data...

logistic <- glm(hd ~ sex, data=data, family="binomial") # fit the logistic regression model and store it as 'logistic'
summary(logistic) # get details of logistic 

Call:
glm(formula = hd ~ sex, family = "binomial", data = data)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept)   -18.57    6522.64  -0.003    0.998
sex1           18.16    6522.64   0.003    0.998

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 7.6382  on 5  degrees of freedom
Residual deviance: 6.7301  on 4  degrees of freedom
AIC: 10.73

Number of Fisher Scoring iterations: 17
# ideally the (deviance) residuals are centred close to zero and roughly symmetric
## Now calculate the overall "Pseudo R-squared" and its p-value
ll.null <- logistic$null.deviance/-2
ll.proposed <- logistic$deviance/-2
 
## McFadden's Pseudo R^2 = [ LL(Null) - LL(Proposed) ] / LL(Null)
(ll.null - ll.proposed) / ll.null
[1] 0.1188836
## The p-value for the R^2
1 - pchisq(2*(ll.proposed - ll.null), df=(length(logistic$coefficients)-1))
[1] 0.3406315
## now we can plot the data
predicted.data <- data.frame(
  probability.of.hd=logistic$fitted.values,
  hd=data$hd)
 
predicted.data <- predicted.data[
  order(predicted.data$probability.of.hd, decreasing=FALSE),]
predicted.data$rank <- 1:nrow(predicted.data)
 
## Lastly, we can plot the predicted probabilities for each sample having
## heart disease and color by whether or not they actually had heart disease
ggplot(data=predicted.data, aes(x=rank, y=probability.of.hd)) +
  geom_point(aes(color=hd), alpha=1, shape=4, stroke=2) +
  xlab("Index") +
  ylab("Predicted probability of getting heart disease")
ggsave("heart_disease_probabilities.pdf")
Saving 7 x 5 in image