# Reset the session: clear the workspace, trigger garbage collection, clear the
# console, and close any open graphics device.
# NOTE(review): rm(list = ls()) wipes the caller's global environment; running
# the script in a fresh R session is the safer way to get a clean state.
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 535725 28.7 1195934 63.9 NA 669417 35.8
## Vcells 988324 7.6 8388608 64.0 16384 1852038 14.2
cat("\f")
# `dev.off` written without parentheses only printed the function's source and
# closed nothing. Call it, but only when a device other than the null device
# (device 1) is open, because dev.off() stops with an error on the null device.
if (dev.cur() > 1L) {
  dev.off()
}
# first we set the working directory
# NOTE(review): this absolute path only exists on the author's machine, and an
# unconditional setwd() aborts the script everywhere else. Change directory only
# when the path is present; otherwise read the CSV files from the current
# working directory.
data_dir <- "/Users/ginaocchipinti/Documents/ADEC 7310 Data Analytics/Week 6"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  message("Data directory not found; reading files from ", getwd())
}
# then pull up the data from our WD and assign it to variables resorts and snow for merging
# The data dictionary is printed for reference only. It is read as Windows-1252
# because the euro sign in the Price description is stored as the single byte
# 0x80, which printed as an escaped byte when no encoding was declared.
read.csv("data_dictionary.csv", fileEncoding = "Windows-1252")
## Table Field
## 1 Resorts ID
## 2 Resorts Resort
## 3 Resorts Latitude
## 4 Resorts Longitude
## 5 Resorts Country
## 6 Resorts Continent
## 7 Resorts Price
## 8 Resorts Season
## 9 Resorts Highest point
## 10 Resorts Lowest point
## 11 Resorts Beginner slopes
## 12 Resorts Intermediate slopes
## 13 Resorts Difficult slopes
## 14 Resorts Total slopes
## 15 Resorts Longest run
## 16 Resorts Snow cannons
## 17 Resorts Surface lifts
## 18 Resorts Chair lifts
## 19 Resorts Gondola lifts
## 20 Resorts Total lifts
## 21 Resorts Lift capacity
## 22 Resorts Child friendly
## 23 Resorts Snowparks
## 24 Resorts Nightskiing
## 25 Resorts Summer skiing
## 26 Snow Month
## 27 Snow Latitude
## 28 Snow Longitude
## 29 Snow Snow
## Description
## 1 Unique identifier for each resort
## 2 Name of the ski & snowboard resort
## 3 Latitude for the resort's location
## 4 Longitude for the resort's location
## 5 Country in which the resort is located
## 6 Continent in which the resort is located
## 7 Ski pass cost for 1 adult for 1 day in the main season (Euro - €)
## 8 Normal start and end of the ski season at the resort (note that it allways will depend on the weather and snowfall)
## 9 Highest mountain point at the resort (meters)
## 10 Lowest possible point to ski at the resort (meters)
## 11 Total length of "children", "blue", and "green" slopes at the resort (km)
## 12 Total length of "red" slopes at the resort (km)
## 13 Total length of "black", "advanced", and "expert" slopes at the resort (km)
## 14 Total length of slopes at the resort (km)
## 15 Longest possible continuous run at the ski resort (km)
## 16 Total amount of snow cannons at the resort
## 17 Total number of surface lifts, including T-bar, Sunkidslift, Rope lifts and people mover
## 18 Total number of chair lifts
## 19 Total number of gondola lifts, including Gondola, Train lifts, Funicular, Combined gondola and chairlifts, Helicopter lifts, Snowcats and Aerial tramways
## 20 Total number of lifts
## 21 Number of passengers the resort's lift system can move in an hour
## 22 Is the ski resort child friendly?
## 23 Does the resort have one or more snowparks?
## 24 Does the resort offer skiing on illuminated slopes?
## 25 Does the resort offer skiing during the summer?
## 26 Date to represent the month of the year (not just the first day)
## 27 Latitude at the center of the region (every "region" is 0.25x0.25 degrees in size)
## 28 Longitude at the center of the region (every "region" is 0.25x0.25 degrees in size)
## 29 Percent of time the region was covered in snow during the month
# Load the two analysis tables.
# NOTE(review): resorts.csv may use the same encoding as the dictionary; if
# resort names print with escaped bytes, add fileEncoding here too -- confirm.
resorts <- read.csv("resorts.csv")
snow <- read.csv("snow.csv")
# apply the merge function
# The two tables share no real key: snow$ID is only a row counter
# (1..820,522), so merging on ID paired resort i with snow row i. The joined
# snow latitude then spanned just 62.6 to 63.1 while resort latitudes span
# -45 to 68, i.e. resorts were matched to unrelated locations.
# Per the data dictionary, snow coordinates are the centres of 0.25 x 0.25
# degree regions, so each resort is snapped to the centre of the region that
# contains it and the join is done on that cell.
cell_size <- 0.25
resort_cells <- transform(
  resorts,
  cell_lat = floor(Latitude / cell_size) * cell_size + cell_size / 2,
  cell_lon = floor(Longitude / cell_size) * cell_size + cell_size / 2
)
# Average snow cover per cell over all months so that the merged table keeps
# exactly one row per resort (the later analysis is resort-level).
snow_by_cell <- aggregate(Snow ~ Latitude + Longitude, data = snow, FUN = mean)
names(snow_by_cell) <- c("cell_lat", "cell_lon", "Snow")
# all.x = TRUE keeps resorts whose cell has no snow record; Snow is NA there.
resort_snowfall <- merge(
  resort_cells, snow_by_cell,
  by = c("cell_lat", "cell_lon"),
  all.x = TRUE
)
# create a summary stats table of the resort_snowfall dataset
# (run ?stargazer interactively for help; the call is not needed in the script)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# summary stats for merged dataset
stargazer(resort_snowfall, type = "text")
## NOTE: the table previously shown here described the incorrect ID-based join;
## re-run the script to regenerate the summary for the location-based join.
# Summary statistics for each source table on its own (numeric columns only,
# as shown in the printed tables below).
# Snow-cover table
stargazer::stargazer(snow, type = "text")
##
## ==========================================================
## Statistic N Mean St. Dev. Min Max
## ----------------------------------------------------------
## ID 820,522 410,261.500 236,864.400 1 820,522
## Latitude 820,522 57.623 17.121 -55.375 83.375
## Longitude 820,522 13.286 97.605 -179.875 179.875
## Snow 820,522 78.018 36.042 0.390 100.000
## ----------------------------------------------------------
# Resorts table
stargazer::stargazer(resorts, type = "text")
##
## ==============================================================
## Statistic N Mean St. Dev. Min Max
## --------------------------------------------------------------
## ID 499 250.000 144.193 1 499
## Latitude 499 43.206 15.976 -45.055 67.784
## Longitude 499 -6.007 59.990 -149.741 176.877
## Price 499 48.721 21.719 0 141
## Highest.point 499 2,160.589 774.340 163 3,914
## Lowest.point 499 1,200.631 596.142 36 3,286
## Beginner.slopes 499 31.820 47.402 0 312
## Intermediate.slopes 499 37.922 44.359 0 239
## Difficult.slopes 499 16.164 20.116 0 126
## Total.slopes 499 85.906 100.874 1 600
## Longest.run 499 3.545 3.948 0 16
## Snow.cannons 499 179.136 372.321 0 2,383
## Surface.lifts 499 11.283 13.365 0 89
## Chair.lifts 499 9.721 11.462 0 74
## Gondola.lifts 499 3.259 5.868 0 40
## Total.lifts 499 24.263 27.673 0 174
## Lift.capacity 499 31,650.960 40,781.990 0 252,280
## --------------------------------------------------------------
Helpful resource: <https://www.youtube.com/watch?v=R2tBwqCQqAs>
# Scatter plot of total lifts (y) against ticket price (x) to eyeball whether
# the relationship looks linear. The y-axis is limited to 0-50 lifts.
with(
  resort_snowfall,
  plot(Price, Total.lifts, ylim = c(0, 50))
)
There appears to be, at most, a weak linear relationship. In some cases a resort has over 40 lifts but a price of 0. These could be data-entry errors or resorts whose lift-ticket price is unknown, or they could be genuinely free resorts, such as public ski areas.
# Correlation between lift count and ticket price, computed on the rows where
# both values are present.
total_lifts_price_cor <- with(
  resort_snowfall,
  cor(Total.lifts, Price, use = "complete.obs")
)
print(total_lifts_price_cor, digits = 4)
## [1] 0.111
The correlation coefficient of 0.111 indicates a weak positive linear relationship between total lifts and price. My original assumption was that more lifts mean higher operating costs for a resort and hence a more expensive ticket. However, the correlation found here does not provide enough evidence to support this.
# let's run a covariance between total lifts and price
price <- resort_snowfall$Price
total_lifts <- resort_snowfall$Total.lifts
# use = "complete.obs" matches the correlation computed on the same two
# columns; without it a single missing value would make the covariance NA.
price_totallifts_cov <- cov(total_lifts, price, use = "complete.obs")
print(price_totallifts_cov, digits = 4)
## [1] 66.74
A positive covariance means that as one variable increases, the other tends to increase, so resorts with more lifts tend to have higher prices. The magnitude of the covariance, however, depends on the units of both variables (euros for price, counts for total lifts), so it cannot be read as the strength of the relationship. The correlation coefficient (0.111) is the standardized measure to use for that.