homework3

Testing

# Programmer: Sarah Akhtar, Date: February 7th, 2025

# 1. Load the tidyverse library
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# 2. Download coasters.csv from dataset and load into R variable
# Confirm the working directory; coasters.csv is read relative to it.
getwd()

## [1] "/Users/sarahak1786/Downloads"

# Read the dataset through a relative path. The earlier setwd() call pointed at
# a user-specific absolute path, which breaks on any other machine and was
# redundant here because the working directory shown above is already the
# folder that holds coasters.csv.
coaster <- read_csv("coasters.csv")

## Rows: 751 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): coaster_name, Location, Status, Type_Main
## dbl (7): Length, Height, Inversions, Duration, Drop, year_introduced, speed
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview the loaded tibble: printing shows the first 10 rows and lists the
# columns that do not fit on screen.
print(coaster)

## # A tibble: 751 × 11
##    coaster_name          Length Location Status Height Inversions Duration  Drop
##    <chr>                  <dbl> <chr>    <chr>   <dbl>      <dbl>    <dbl> <dbl>
##  1 Switchback Railway       600 Coney I… Remov…   50           NA       60    43
##  2 Leap-The-Dips           1452 Lakemon… Opera…   41           NA       60     9
##  3 Racer (1910 roller c…   4500 Kennywo… Opera…   72.5          0       92    50
##  4 Zippin Pippin           2865 Other    <NA>     70           NA      118    70
##  5 The Wild One (roller…   4000 Six Fla… <NA>     98            0      112    88
##  6 Jack Rabbit (Kennywo…   2132 Kennywo… Opera…   40            0       96    70
##  7 Jack Rabbit (Seabree…   2130 Seabree… Opera…   75           NA       NA    NA
##  8 Scenic Railway (roll…   3000 Dreamla… Opera…   40           NA       NA    40
##  9 Roller Coaster (Lago…   2500 Lagoon … Opera…   62            0       NA    NA
## 10 Big Dipper (Blackpoo…   3300 Blackpo… Opera…   65            0      148    50
## # ℹ 741 more rows
## # ℹ 3 more variables: year_introduced <dbl>, Type_Main <chr>, speed <dbl>

# 3. Use the str() function to get more information about the dataframe
# Answer: There are 751 rows and 11 columns in the dataset.
# Show the tibble's dimensions and, for every column, its type and first few
# values, followed by the readr column specification attached to the object.
str(coaster)

## spc_tbl_ [751 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ coaster_name   : chr [1:751] "Switchback Railway" "Leap-The-Dips" "Racer (1910 roller coaster)" "Zippin Pippin" ...
##  $ Length         : num [1:751] 600 1452 4500 2865 4000 ...
##  $ Location       : chr [1:751] "Coney Island" "Lakemont Park" "Kennywood" "Other" ...
##  $ Status         : chr [1:751] "Removed" "Operating" "Operating" NA ...
##  $ Height         : num [1:751] 50 41 72.5 70 98 40 75 40 62 65 ...
##  $ Inversions     : num [1:751] NA NA 0 NA 0 0 NA NA 0 0 ...
##  $ Duration       : num [1:751] 60 60 92 118 112 96 NA NA NA 148 ...
##  $ Drop           : num [1:751] 43 9 50 70 88 70 NA 40 NA 50 ...
##  $ year_introduced: num [1:751] 1884 1902 1910 1912 1917 ...
##  $ Type_Main      : chr [1:751] "Wood" "Wood" "Wood" "Wood" ...
##  $ speed          : num [1:751] 6 10 40 40 53 45 42 35 45 40 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   coaster_name = col_character(),
##   ..   Length = col_double(),
##   ..   Location = col_character(),
##   ..   Status = col_character(),
##   ..   Height = col_double(),
##   ..   Inversions = col_double(),
##   ..   Duration = col_double(),
##   ..   Drop = col_double(),
##   ..   year_introduced = col_double(),
##   ..   Type_Main = col_character(),
##   ..   speed = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

# 4. Classify each variable as categorical or quantitative
# Answer: Categorical Variables: coaster_name, Location, Status, Type_Main
#         Quantitative Variables: Length, Height, Inversions, Duration, 
#                                 Drop, year_introduced, speed

# 5. Print the 5th through 10th values of the column coaster_name
# Pull the coaster_name column out as a character vector, then keep
# elements 5 through 10.
coaster[["coaster_name"]][5:10]

## [1] "The Wild One (roller coaster)"        
## [2] "Jack Rabbit (Kennywood)"              
## [3] "Jack Rabbit (Seabreeze)"              
## [4] "Scenic Railway (roller coaster)"      
## [5] "Roller Coaster (Lagoon)"              
## [6] "Big Dipper (Blackpool Pleasure Beach)"

# 6. Use the table() function on the Status column of the dataframe
# Tally the coasters by Status. table() leaves out NA values by default, which
# is why the counts below add up to fewer than the 751 rows in the data.
table(coaster[["Status"]])

## 
## Operating   Removed 
##       471        97

# 7. Create a data frame sub_coasters that contains only Length/Height
# Keep every row but only the two size-related columns, then display the result.
sub_coasters <- coaster[c("Length", "Height")]
print(sub_coasters)

## # A tibble: 751 × 2
##    Length Height
##     <dbl>  <dbl>
##  1    600   50  
##  2   1452   41  
##  3   4500   72.5
##  4   2865   70  
##  5   4000   98  
##  6   2132   40  
##  7   2130   75  
##  8   3000   40  
##  9   2500   62  
## 10   3300   65  
## # ℹ 741 more rows

# 8. Write the first 10 rows of sub_coasters to a csv file and submit the output
# Take the first 10 rows. head() returns at most 10 rows and never indexes past
# the end of the table, unlike a hard-coded 1:10 row index, which assumes the
# table has at least 10 rows.
first_sub_coasters <- head(sub_coasters, 10)
print(first_sub_coasters)

## # A tibble: 10 × 2
##    Length Height
##     <dbl>  <dbl>
##  1    600   50  
##  2   1452   41  
##  3   4500   72.5
##  4   2865   70  
##  5   4000   98  
##  6   2132   40  
##  7   2130   75  
##  8   3000   40  
##  9   2500   62  
## 10   3300   65

# Export the 10-row subset as a CSV file in the current working directory.
write_csv(x = first_sub_coasters, file = "homework3_coasters.csv")

# 9. Create a bar plot using geom_bar() of the Inversions variable
# Count how many coasters have each number of inversions. Arguments are named
# explicitly so the data source and the mapped column are clear at a glance.
ggplot(data = coaster, mapping = aes(x = Inversions)) +
  geom_bar()

## Warning: Removed 74 rows containing non-finite outside the scale range
## (`stat_count()`).

# Save the chart that was just drawn; last_plot() is ggsave()'s default plot.
ggsave(filename = "plot1.png", plot = last_plot())

## Saving 7 x 5 in image

## Warning: Removed 74 rows containing non-finite outside the scale range
## (`stat_count()`).

# 10. Create a scatterplot using geom_point() showing the relationship
# Answer: It seems that as the length of the ride increases, the duration also
#         increases. The relationship appears to be a direct (positive)
#         correlation that roughly follows a linear path.
# Plot ride duration (y) against track length (x), one point per coaster.
ggplot(data = coaster, mapping = aes(x = Length, y = Duration)) +
  geom_point()

## Warning: Removed 174 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Save the scatterplot that was just drawn (ggsave()'s default plot).
ggsave(filename = "plot2.png", plot = last_plot())

## Saving 7 x 5 in image

## Warning: Removed 174 rows containing missing values or values outside the scale range
## (`geom_point()`).

# 11. Create a histogram using geom_histogram() of the year_introduced variable
# Histogram of introduction year, drawn as one panel per main coaster type.
# The bin count is left at the ggplot2 default, as the message below reports.
ggplot(data = coaster, mapping = aes(x = year_introduced)) +
  geom_histogram() +
  facet_wrap(facets = vars(Type_Main))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Save the faceted histogram that was just drawn (ggsave()'s default plot).
ggsave(filename = "plot3.png", plot = last_plot())

## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# 12. Create a scatterplot showing the relationship between year_introduced/speed
# Answer: From this graph, we can see that as time passes, the speed of coasters
#         increases along with the height and drop length. In addition to this, 
#         we see that recent coasters are often made with steel rather than wood.
# Speed against introduction year, with three more variables encoded on the
# points: marker shape for main type, colour for drop, and size for height.
ggplot(
  data = coaster,
  mapping = aes(
    x = year_introduced,
    y = speed,
    shape = Type_Main,
    color = Drop,
    size = Height
  )
) +
  geom_point()

# Save the scatterplot that was just drawn (ggsave()'s default plot).
ggsave(filename = "plot4.png", plot = last_plot())

## Saving 7 x 5 in image

# 13. Create another visualization that highlights a notable trend or pattern
# Speed against height, coloured and split into panels by main coaster type,
# with a straight-line (lm) fit drawn over the points in each panel.
# NOTE(review): unlike plots 1-4, this plot is not written to disk with
# ggsave() -- confirm it is exported some other way before submitting.
ggplot(
  data = coaster,
  mapping = aes(x = Height, y = speed, color = Type_Main)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(facets = vars(Type_Main))

## `geom_smooth()` using formula = 'y ~ x'

# 14. Explain your interpretation of your plot from question 13
# Answer: From the plot, we can see that as the height of the coaster increases,
#         the speed also increases. I also separated the coasters by main type
#         to demonstrate that steel coasters are not only more abundant, but
#         also reach much higher top speeds than the other and wood types.

# 15. Turn in your R script file as well as the plots you generated
# [COMPLETED]

homework3

2025-02-13

Testing