library(datasets)
library(moments)
library(vioplot)
data("ChickWeight")
data <- airquality
#a
max_values <- sapply(data, max, na.rm = TRUE)
max_variable <- names(which.max(max_values))
max_variable
## [1] "Solar.R"
#b
missing_vars <- names(data)[sapply(data, function(x) any(is.na(x)))]
missing_vars
## [1] "Ozone" "Solar.R"
airquality_clean <- na.omit(data)
# airquality_clean   # uncomment to print the cleaned data
#c
sk_values <- sapply(airquality_clean, skewness)
max_sk_var <- names(which.max(sk_values))
max_sk_var
## [1] "Ozone"
#d
hist(airquality_clean[[max_sk_var]], main = paste("Hist: ",max_sk_var))
boxplot(airquality_clean[[max_sk_var]], main = paste("Boxplot: ",max_sk_var))
vioplot(airquality_clean[[max_sk_var]], names = max_sk_var, main = paste("Violinplot:", max_sk_var))
The histogram shows whether the distribution is shifted to the left or right. Skewness can be read from how the values are "tilted": a longer tail on the right side means right (positive) skewness, a longer tail on the left side means left (negative) skewness, and a roughly symmetric histogram means the distribution is close to normal with skewness near zero.
On the boxplot, skewness shows up in the asymmetry of the whiskers and in the position of the median. If the whiskers on one side of the box are longer than on the other, that suggests skewness; likewise, a median lying closer to the bottom or the top of the box indicates a skewed distribution.
#e The skewness of a variable can be determined from the plots:
Histogram: skewness is visible from the asymmetry of the distribution, e.g. a long right tail (positive skewness) means most of the values are concentrated on the left.
Boxplot: skewness is visible from the position of the median and the length of the whiskers; a longer whisker on one side indicates the direction of the skew.
Violin plot: this plot shows the density of the distribution; as in the boxplot, one tail being longer indicates skewness. A quick numerical cross-check is sketched below.
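A minimal sketch (reusing airquality_clean and max_sk_var from above, and skewness() from the moments package) that backs up the visual impression with numbers; the log transform is only an illustration of how a long right tail can be pulled in:
skewness(airquality_clean[[max_sk_var]])       # clearly positive => right-skewed, matching the plots
skewness(log(airquality_clean[[max_sk_var]]))  # Ozone is strictly positive here, so log() is defined; the skewness should move towards 0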
data1 <- ToothGrowth
#a
sp_result <- sapply(data1, function(x) if (is.numeric(x)) shapiro.test(x)$p.value else NA)
normal_var <- names(sp_result)[which(sp_result > 0.1)]
normal_var
## [1] "len"
#b
t.test(data1[[normal_var]], mu = 20)
##
## One Sample t-test
##
## data: data1[[normal_var]]
## t = -1.2017, df = 59, p-value = 0.2343
## alternative hypothesis: true mean is not equal to 20
## 95 percent confidence interval:
## 16.83731 20.78936
## sample estimates:
## mean of x
## 18.81333
#c
t.test(data1[[normal_var]] ~ data1$supp, var.equal = TRUE)
##
## Two Sample t-test
##
## data: data1[[normal_var]] by data1$supp
## t = 1.9153, df = 58, p-value = 0.06039
## alternative hypothesis: true difference in means between group OJ and group VC is not equal to 0
## 95 percent confidence interval:
## -0.1670064 7.5670064
## sample estimates:
## mean in group OJ mean in group VC
## 20.66333 16.96333
t.test(data1[[normal_var]] ~ data1$supp, var.equal = TRUE, conf.level = 0.9)
##
## Two Sample t-test
##
## data: data1[[normal_var]] by data1$supp
## t = 1.9153, df = 58, p-value = 0.06039
## alternative hypothesis: true difference in means between group OJ and group VC is not equal to 0
## 90 percent confidence interval:
## 0.4708204 6.9291796
## sample estimates:
## mean in group OJ mean in group VC
## 20.66333 16.96333
data2 <- ChickWeight
#a
tapply(data2$weight, data2$Diet, shapiro.test)
## $`1`
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.89336, p-value = 2.211e-11
##
##
## $`2`
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.90399, p-value = 3.159e-07
##
##
## $`3`
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.91646, p-value = 1.509e-06
##
##
## $`4`
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.95437, p-value = 0.0005195
bartlett.test(data2$weight, data2$Diet)
##
## Bartlett test of homogeneity of variances
##
## data: data2$weight and data2$Diet
## Bartlett's K-squared = 29.489, df = 3, p-value = 1.768e-06
#b
cat("\tAnova:\n\n")
## Anova:
anova_model <- aov(weight ~ Diet, data2)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## Diet 3 155863 51954 10.81 6.43e-07 ***
## Residuals 574 2758693 4806
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05: at least one diet group has a significantly different mean weight
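A small sketch of pulling the overall p-value out of the ANOVA table programmatically (standard summary.aov indexing, reusing anova_model from above):
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
anova_p < 0.05  # TRUE => reject H0 that all diet groups have the same mean weight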
#c
cat("\n\tTukey:\n\n")
##
## Tukey:
TukeyHSD(anova_model)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = weight ~ Diet, data = data2)
##
## $Diet
## diff lwr upr p adj
## 2-1 19.971212 -0.2998092 40.24223 0.0552271
## 3-1 40.304545 20.0335241 60.57557 0.0000025
## 4-1 32.617257 12.2353820 52.99913 0.0002501
## 3-2 20.333333 -2.7268370 43.39350 0.1058474
## 4-2 12.646045 -10.5116315 35.80372 0.4954239
## 4-3 -7.687288 -30.8449649 15.47039 0.8277810
#library(agricolae)
#ansch<-aov(weight ~ Diet, ChickWeight)
#result<- scheffe.test(ansch,"Diet", alpha = 0.05)
#result
#c If the adjusted p-value for a pair is below 0.05, the difference between those two factor levels is statistically significant. The TukeyHSD output lists the specific pairs: according to the table above, diets 3 and 4 differ significantly from diet 1 (diet 3 shows the largest mean difference, about 40.3), while the remaining pairs do not reach significance.
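A short sketch (reusing anova_model from above) that filters the Tukey table down to the pairs with an adjusted p-value below 0.05:
tukey_tab <- TukeyHSD(anova_model)$Diet
tukey_tab[tukey_tab[, "p adj"] < 0.05, ]  # only the significantly different diet pairs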
data4 <- airquality_clean
#a
cor_matrix <- cor(data4)
ozone_cor <- cor_matrix["Ozone",]
strngst_pos <- names(which.max(ozone_cor[ozone_cor<1]))
strngst_neg <- names(which.min(ozone_cor))
strngst_pos
## [1] "Temp"
strngst_neg
## [1] "Wind"
#b
spearman_cor <- cor(data4, method = "spearman")
kendall_cor <- cor(data4, method = "kendall")
spearman_cor["Ozone",]
## Ozone Solar.R Wind Temp Month Day
## 1.00000000 0.34818647 -0.60513642 0.77293193 0.11669011 -0.03504654
kendall_cor["Ozone",]
## Ozone Solar.R Wind Temp Month Day
## 1.00000000 0.24031942 -0.44045944 0.58614712 0.08859401 -0.03041526
#c
plot(data4$Ozone, data4[[strngst_pos]], main = "Ozone vs strongest Positive")
plot(data4$Ozone, data4[[strngst_neg]], main = "Ozone vs strongest Negative")
# main = paste("Ozone vs", strngst_pos)   # alternative title using the variable name
data5 <- ChickWeight
#a
linear_model <- lm(weight ~ Time + Diet, data5)
summary(linear_model)$adj.r.squared
## [1] 0.7435224
# summary(linear_model)   # uncomment for the full summary
#b
full_model <- lm(weight ~ . , data = data5)
summary(full_model)$adj.r.squared
## [1] 0.8416462
# summary(full_model)   # uncomment for the full summary
#c
AIC(linear_model,full_model)
## df AIC
## linear_model 6 5789.607
## full_model 52 5554.520
BIC(linear_model,full_model)
## df BIC
## linear_model 6 5815.765
## full_model 52 5781.218
#d
linear_model
##
## Call:
## lm(formula = weight ~ Time + Diet, data = data5)
##
## Coefficients:
## (Intercept) Time Diet2 Diet3 Diet4
## 10.92 8.75 16.17 36.50 30.23
full_model
##
## Call:
## lm(formula = weight ~ ., data = data5)
##
## Coefficients:
## (Intercept) Time Chick.L Chick.Q Chick.C Chick^4
## 27.8983 8.7152 130.2672 -24.9790 -16.2937 16.8300
## Chick^5 Chick^6 Chick^7 Chick^8 Chick^9 Chick^10
## -11.8874 62.0273 -5.7763 -2.1871 -2.5741 -15.8198
## Chick^11 Chick^12 Chick^13 Chick^14 Chick^15 Chick^16
## -29.2864 12.6115 45.5732 8.4847 -36.7921 -18.5430
## Chick^17 Chick^18 Chick^19 Chick^20 Chick^21 Chick^22
## 13.8154 19.2098 -15.1139 19.1073 10.3924 -13.8848
## Chick^23 Chick^24 Chick^25 Chick^26 Chick^27 Chick^28
## -2.6091 3.7279 -30.7405 22.9762 37.2504 11.6756
## Chick^29 Chick^30 Chick^31 Chick^32 Chick^33 Chick^34
## -15.8480 -9.6065 2.6487 15.8318 -4.2143 -8.3473
## Chick^35 Chick^36 Chick^37 Chick^38 Chick^39 Chick^40
## 0.6771 -17.3659 -2.6001 -15.5406 -2.8088 39.4063
## Chick^41 Chick^42 Chick^43 Chick^44 Chick^45 Chick^46
## -15.4934 -27.7127 -32.3320 -9.6068 -12.8100 21.7381
## Chick^47 Chick^48 Chick^49 Diet2 Diet3 Diet4
## 6.3382 16.3989 -25.3853 NA NA NA
best_model <- if (AIC(linear_model) < AIC(full_model)) {
linear_model
} else {
full_model
}
library(randtests)
residuals_best_model <- residuals(best_model)
shapiro.test(residuals_best_model)
##
## Shapiro-Wilk normality test
##
## data: residuals_best_model
## W = 0.98127, p-value = 9.211e-07
runs.test(residuals_best_model)
##
## Runs Test
##
## data: residuals_best_model
## statistic = -15.903, runs = 99, n1 = 289, n2 = 289, n = 578, p-value <
## 2.2e-16
## alternative hypothesis: nonrandomness
For a good model, the (adjusted) R² should be close to 1, the overall p-value should be below alpha (0.05), the residuals should be symmetric, and the errors should be close to 0.
Shapiro-Wilk test: the residuals are not normally distributed (p < alpha). Runs test: the residuals are not random. A visual check of the same residuals is sketched below.
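A minimal sketch of the visual counterpart to the two tests above, reusing residuals_best_model:
par(mfrow = c(1, 2))
hist(residuals_best_model, main = "Residuals of best model", xlab = "residual")
qqnorm(residuals_best_model)  # points should follow the line if the residuals were normal
qqline(residuals_best_model)
par(mfrow = c(1, 1))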