This report analyzes the tornado dataset through exploratory data analysis, visualization, and statistical modeling.
This section loads the tornado dataset and displays its summary statistics.
# Import the tornado records from the CSV file on disk.
# NOTE(review): the path is absolute and machine-specific; the report only
# runs where this exact file location exists.
dataset_al_tornado <- read.csv(
  file = "C:\\Users\\Ross\\Documents\\RDatasets\\dataset_al_tornado.csv"
)
# Print a per-column overview: quantiles and NA counts for numeric columns,
# length/class/mode for character columns.
summary(object = dataset_al_tornado)
## Tornado...in.Year Year Month Day
## Min. : 1.00 Min. :1794 Min. : 1.000 Min. : 1.00
## 1st Qu.: 5.00 1st Qu.:1961 1st Qu.: 3.000 1st Qu.: 8.00
## Median : 14.00 Median :1991 Median : 4.000 Median :17.00
## Mean : 21.67 Mean :1977 Mean : 5.545 Mean :16.61
## 3rd Qu.: 30.00 3rd Qu.:2008 3rd Qu.: 9.000 3rd Qu.:24.00
## Max. :145.00 Max. :2018 Max. :12.000 Max. :31.00
## NA's :11 NA's :16
## Time..CST. County Damage.Scale
## Length:2663 Length:2663 Length:2663
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Maximum.Path.Width..Yards. Path.Length..Miles. Fatalities
## Min. : 0.0 Length:2663 Length:2663
## 1st Qu.: 30.0 Class :character Class :character
## Median : 100.0 Mode :character Mode :character
## Mean : 176.3
## 3rd Qu.: 200.0
## Max. :2600.0
## NA's :136
## Injuries Location X
## Length:2663 Length:2663 Mode:logical
## Class :character Class :character NA's:2663
## Mode :character Mode :character
##
##
##
##
library(ggplot2)
# Records per year: length() counts the rows that fall into each Year group.
# With the formula interface, rows where either variable is NA are omitted
# by default before counting.
tornado_counts <- aggregate(
  Tornado...in.Year ~ Year,
  data = dataset_al_tornado,
  FUN = length
)
# Stacked bar chart: one bar per month (Month treated as a discrete factor),
# with each bar split by damage scale.
ggplot(
  data = dataset_al_tornado,
  mapping = aes(x = factor(Month), fill = Damage.Scale)
) +
  geom_bar() +
  labs(
    title = "Tornado Occurrences by Month",
    x = "Month",
    y = "Tornado Count",
    fill = "Damage Scale"
  ) +
  # Tilt the x-axis tick labels 45 degrees and right-align them.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Frequency of each Damage Scale category.
# Damage.Scale is read in as a character column, so summary() on it reports
# only Length/Class/Mode and says nothing about the categories. table()
# tabulates the number of records per category instead; useNA = "ifany" adds
# an NA entry only when missing values are present.
# (Re-run the report to print the resulting counts.)
table(dataset_al_tornado$Damage.Scale, useNA = "ifany")
# Line chart of the per-year record counts stored in tornado_counts.
ggplot(
  data = tornado_counts,
  mapping = aes(x = Year, y = Tornado...in.Year)
) +
  geom_line() +
  labs(
    title = "Tornado Frequency Over Years",
    x = "Year",
    y = "Tornado Count"
  )
# Records per damage-scale category: length() counts the rows in each group.
damage_counts <- aggregate(
  Tornado...in.Year ~ Damage.Scale,
  data = dataset_al_tornado,
  FUN = length
)
# Bar heights are the pre-computed counts, so geom_col() (bars drawn from the
# y values as given) is used rather than letting the geom count rows itself.
ggplot(
  data = damage_counts,
  mapping = aes(x = Damage.Scale, y = Tornado...in.Year)
) +
  geom_col() +
  labs(
    title = "Tornado Count by Damage Scale",
    x = "Damage Scale",
    y = "Tornado Count"
  )
# Ordinary least-squares fit of Tornado...in.Year on Month and Damage.Scale.
# Month enters as a single numeric term (one slope); Damage.Scale is a
# character column and is expanded into one indicator per category.
# NOTE(review): if a separate effect per month is intended, Month would need
# to be wrapped in factor() — confirm with the author.
# NOTE(review): the response looks like a within-year sequence number rather
# than a yearly total — confirm it is the intended outcome variable.
model <- lm(
  formula = Tornado...in.Year ~ Month + Damage.Scale,
  data = dataset_al_tornado
)
# Coefficient table, residual quantiles, R-squared and overall F-test
summary(object = model)
##
## Call:
## lm(formula = Tornado...in.Year ~ Month + Damage.Scale, data = dataset_al_tornado)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.278 -11.462 -4.429 4.825 108.193
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.4202 1.0861 11.435 < 2e-16 ***
## Month 2.6887 0.1211 22.198 < 2e-16 ***
## Damage.ScaleF1 -1.9205 1.0509 -1.827 0.0678 .
## Damage.ScaleF2 -12.0571 1.1007 -10.954 < 2e-16 ***
## Damage.ScaleF3 -10.5127 1.6224 -6.480 1.1e-10 ***
## Damage.ScaleF4 -3.7180 2.6674 -1.394 0.1635
## Damage.ScaleF5 12.4109 7.2960 1.701 0.0891 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.5 on 2523 degrees of freedom
## (133 observations deleted due to missingness)
## Multiple R-squared: 0.2266, Adjusted R-squared: 0.2248
## F-statistic: 123.2 on 6 and 2523 DF, p-value: < 2.2e-16