#STATISTICS/ BASIC----
#**Descriptive statistics----
#Razlikujemo srednju vrijednost tj. mean. Ako imamo outliera koristimo median ili MAD. MAD je median absolut deviation.
##Na MAD ne uti?e ni jedan outlier. On ra?una median i distance/razdaljinu od median.
#Tako da mo?emo re?i, ako imamo tzv.skewed distribution/zakrivljenu distribuciju korisimo:
#1. meadian
#2. spearman corelation
#3. MAD
#Ako imamo normalnu distribuciju koristimo: mean, Parson corelation i standardnu devijaciju.
#da bismo znali da li nam je normalna ili zakrivljena distribucija koirstimo se vizuelnim tehnikama ili testovima
library(car) #paket za vizualno testiranje distribucije podataka
## Loading required package: carData
library(ggplot2)
WHO <- read.csv("WHO.csv")
ggplot(WHO, aes (x=LifeExpectancy)) + geom_density() #vidimo zakrivljenost

qqPlot(WHO$LifeExpectancy) # da je kriva po plavoj punoj liniji onda je normalna distribuicja, i unutar isprekidanih plavih, sve mimo toga zna?i da nije normalna distribucija

## [1] 154 33
#(or quantile-quantile plot) draws the correlation between a given sample and the normal distribution. A 45-degree reference line is also plotted
#testiranje normalnosti
##prije tuma?enja Shapiro testa bitno nam je da znamo ?ta je testiramo odnosno ?ta nam je nulta hipoteza.
#Nul hypothesis u Shapiro test je na?a distribucija je normalna. Ako nam p-value manji od 0.05 onda sa 95% sigurno??u odbacujemo nultu hipotezu,
#tj. u ovom slu?aju zaklju?ujomo da se radi o zakljiveljnoj distribuciji tj. mi pretpostavljamo inormality.
shapiro.test(WHO$LifeExpectancy)
##
## Shapiro-Wilk normality test
##
## data: WHO$LifeExpectancy
## W = 0.93077, p-value = 5.696e-08
#ako zelimo da uradimo descriptivnu statistiku za LifeExpectancy najjednostavnije je
summary(WHO$LifeExpectancy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 47.00 64.00 72.50 70.01 76.00 83.00
#vidimo da nam jo? nedostaje MAD i sd u slu?aju normalne distribucije
mad(WHO$LifeExpectancy) #inormality distribution
## [1] 8.1543
sd (WHO$LifeExpectancy) #normality distribution
## [1] 9.259075
#prije nego pre?emo na obja?njenej kvintila i outliera na jednom primjeru sa manjim vektorom da vidimo ?ta u stvari zna?i median a ?ta mean.
# npr. imamo vektor podataka: 57,40,103,234,93,53,116,98,108,121,22
##srednju vrijednost racunamo na nacin da saberemo sve ove podatke i podijelimo sa broj obzervacija tj.:
sum(c(57,40,103,234,93,53,116,98,108,121,22))/11 #dakle srednja vrijednost je 95
## [1] 95
#za medijanu je malo druga?ije: provo moramo poredati podatke od najmanje ka najve?oj: 22,40,53,57,93,98,103,108,116,121,234.
##medijana je srednja vrijednost - dale imamo 11 elemenata srednja vrijednost nam je ?esti element tj. 98. Dakle medijana je 98
median(c(57,40,103,234,93,53,116,98,108,121,22))
## [1] 98
##raspon je jo? jedna od mjera opisne statistike a to je razlika najve?e i najmanje vrijednosti u ovom slu?aju 234-22=221
#kroz funkciju u R dobiva se na na?in
range(c(57,40,103,234,93,53,116,98,108,121,22))
## [1] 22 234
#Interkvartalni raspon (IQR) je raspon izme?u prvog i tre?eg quartile. tj. sredina odnosno 50% distribucije (Q3-Q1)
#Q1 je medijana elemenata izmedju minimalne i medijane, dakle ono se ne ra?unaju! U ovom primjeru Q1 je:
(53+57)/2
## [1] 55
#Po istoj logici Q3 je:
(108+116)/2
## [1] 112
#kroz formulu provjerimo:
summary(c(57,40,103,234,93,53,116,98,108,121,22))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22 55 98 95 112 234
#**outlayer----
##Postoji nepisano pravilo kako se ra?una outlier:1.5 x IQR - Q1 ili 1.5 x IQR + Q3
##u nasem primjeru to bi bilo
55 - 1.5 * (112-55) # dakle donja granica nam je -30.5, tako da sa "donje strane" nemamo outliera
## [1] -30.5
112 + 1.5 * (112-55) #dakle sve ve?e od 197.5 je outlayer samim tim i vrijednosti od 234
## [1] 197.5
#u R mozemo to provjeriti i istovremeno i vizuelno i da nam "izbaci"
boxplot(c(57,40,103,234,93,53,116,98,108,121,22))$out

## [1] 234
#na isti na?in mozemo se vratit nasem primjeru i provjeriti outlier: ovo uradite sami
boxplot(WHO$LifeExpectancy)$out

## numeric(0)
#Varijansa (s2) : average squared deviance of each score from the mean.
varijansa = 32246/10 #3224.6
#dakle svaki element npr 22 udaljen od srednje vrijednosti pa kvadriran, tj.
(22-95)^2 #5329, i tako za svaki elemenet (40-95)^2.. i to se sve sabere i podijeli sa n-1 tj. sa 11-1 tj. sa 10
## [1] 5329
#standardna devijacija se vise koriste i ona je drugi korijen iy varijanse tj.
sqrt (3224.6) #56.79
## [1] 56.78556
#provjerimo i u R
var(c(57,40,103,234,93,53,116,98,108,121,22))
## [1] 3224.6
sd (c(57,40,103,234,93,53,116,98,108,121,22))
## [1] 56.78556
#prije korelacije moramo visualizirati podatak
##to je iz razloga sto veza izmedju dvije varijable moze biti paraboli?na a kad radimo corelaciju ona bude 0
#insert dataframe
mydataframe <- read.csv("mydf.csv")
cor(mydataframe$dohodak, mydataframe$godine) #korelacija izmedju x i y korelacija je 1
## [1] 1
plot(mydataframe$godine,mydataframe$dohodak) #na x osu ide nezavisna a na y zavisna varijabla

#ako dohodak formiramo na drugi na??in
mydataframe$dohodak1 <- sample(mydataframe$dohodak,12) #note. dobili smo cetvrtu varijablu
cor(mydataframe$dohodak1, mydataframe$godine) #dobivamo razlicit rezultat jer se radi o slucajnom uzorku
## [1] -0.04895105
plot(mydataframe$godine,mydataframe$dohodak1)

#VIZUELAN PRIKAZ KORELACIJE
vars <- c("dohodak","dohodak1", "godine") #moraju biti numericke varijable#iformal definition>
#In probability theory and statistics, variance is the expectation of the squared deviation of a random variable from its mean.
#Informally, it measures how far a set of (random) numbers are spread out from their average value.
cormatrix <- cor(mydataframe[,vars])#formiramo korelacijsku matricu sa "odabranim" varijablama
library(corrplot) #pozovemo corrplot paket
## corrplot 0.84 loaded
corrplot(cormatrix)

##covariance
#RAZUMJEVANJE MEAN, MEDIJANA, Q1, Q3
summary(mydataframe$dohodak1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 490.9 868.2 1245.5 1245.5 1622.7 2000.0
mydataframe$dohodak1 # radi razumjevanja sta je q1 q3 i medijana
## [1] 1862.8136 490.9500 1451.2545 902.5091 1039.6955 2000.0000 1314.0682
## [8] 1176.8818 628.1364 1725.6273 1588.4409 765.3227
plot(density(mydataframe$dohodak1)) #vizuelni prikaz distribucije

mean (mydataframe$dohodak1)
## [1] 1245.475
median (mydataframe$dohodak1)
## [1] 1245.475
quantile (mydataframe$dohodak1) # procentni kvintili
## 0% 25% 50% 75% 100%
## 490.9500 868.2125 1245.4750 1622.7375 2000.0000
var (mydataframe$dohodak1) # varijansa
## [1] 244661.3
sd(mydataframe$dohodak1) #standardna devijacija#note da je sd=sqrt(var) sqrt(244660.5)
## [1] 494.6325
#STATISTICAL TESTS----
## test statistics = signal/noise = variance explained by the model/variance not explaned by the model = effect/error
##the larger t is (or other statistics), the more likely you will reject Ho, since there is more signal than noise
#t distribution - can be used with any statistics having a bell shaped distribution. CLT states the sample distribution of a statistic will be close to normal with a large enough sample size.
##As a rough estimate CLT predicts a roughly normal distribution under any of the following conditions:
##1. population distribution is normal; or
##2. sampling distribution is symetric and the sample size <=15; or
##3. sampling distribution is moderatly skewed and the sample size is 16<= n <= 30; or
##4. the sample size is >30 without outliers.
#RULE: if p-value is less then alpha WE REJECT NULL HYPOTHESIS
#alpha = significance level of test
#ttest..sada cemo raditi samo kodove za testove----
#t.test(x, mu=30) #single sample test
#t.test (x,y) #independent t test #two samples are independent
#t.test (x1, x2, paired = TRUE) #dependent: this is used pre/post data when we apply lets say experiment
#t.test(pre, post, paired = T, alternative = "less") #one tailed dependent t test
#t test se upotrebljava kada imamo samo dva faktora
#ako ga izvodimo na WHO bazi vec znamo da imamo jednu factor varijablu i ona ima vise od dva nivoa
#radi toga prvo formiramo vektor
#example chi squere Tokyo
list.files()
## [1] "Anova test of variance.R"
## [2] "boxplot.JPG"
## [3] "Capture.JPG"
## [4] "mydf.csv"
## [5] "prvi_put_registrovana_vozila_12_2015.csv"
## [6] "statistics-dec2019.html"
## [7] "statistics-dec2019.R"
## [8] "statistics-dec2019.spin.R"
## [9] "statistics-dec2019.spin.Rmd"
## [10] "statistics dec2019.R"
## [11] "statistics.html"
## [12] "statistics.R"
## [13] "table1.txt"
## [14] "Tokyo_updated.xlsx"
## [15] "WHO.csv"
#probati samostalno na primjeru Tokyo_update da li se statisti?ki razlikuju geneder i sklonost u kupovini odredjene marke
Tokyo <- readxl::read_xlsx( "Tokyo_updated.xlsx")
## New names:
## * `` -> ...13
names(Tokyo)
## [1] "Store" "Brand" "Type" "Gender" "Size"
## [6] "Color" "Category" "Sales Price" "Date" "Time"
## [11] "Loyalty" "Month" "...13"
#korak prvi napravimo dva vektora
#1. t.test
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
SP.female <- Tokyo %>% filter(Gender== "Female") %>% select(`Sales Price`) %>% unlist ()
SP.male <- Tokyo %>% filter(Gender== "Male") %>% select(`Sales Price`) %>% unlist ()
t.test(SP.female, SP.male) #postoji statisticki znacajna razlika izmedju spolova. Zene u prosjeku trose 100.3 dolara a muskaci 125.9 dolara
##
## Welch Two Sample t-test
##
## data: SP.female and SP.male
## t = -10.862, df = 1017.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -30.20866 -20.96414
## sample estimates:
## mean of x mean of y
## 100.3078 125.8942
#buduci da nam je p-value manji od 0.05 odbacujemo nultu hipotezu i prihvacamo alternaivnu
#CHI SQUERE TEST----
##SUM(O-E)^2/E ...O is observed, E is expected
##this is goodnes of fit and it is used for cathegorical data
#ASSUMPIONS FOR CHI SQUERE TEST
##1. Random sample
##2. Indipendent observation for the sample (one observation per subject). This means one person can not be in both groups
##3. All expected counts are greater then 1 in each of our cells
##4. No more than 20% of cells with and expected counts are less then five
#STEPS IN CONDUCTING THE CHI SQUERE TEST
## 1. Clearly state the null and the hypothesis
## 2. Identify an appropriate test and significance level
## 3. Analyze sample data:
###3a Create a table to organize data
###3b Compare chi squer and alpha t test
## 4. Interpret the results
#goodness of fit tests if observed distribution is equal to expected distribution
#test of independence tests is variable x independent of variable y, not how they are related
#chisq example----
#Test of independence is more used and the steps are> - Tokyo
##1. first form a table
table1 <- table (Tokyo$Gender, Tokyo$Brand)
##2. then check the assumptions
#chisq.test(table1)$expected #if values are greater then 5 we are goot to go with testing
chisq.test(table1)$expected
##
## Adidas Asics Nike
## Female 102.88217 511.0297 296.0881
## Male 52.34554 260.0074 150.6470
## Unisex 57.77229 286.9628 166.2649
##3. then we do the test and intepret the results
#chisq.test(table1)
chisq.test(table1) #reject the null hypothesis
##
## Pearson's Chi-squared test
##
## data: table1
## X-squared = 400.88, df = 4, p-value < 2.2e-16
#Note: if we didn't satisfied the second assumptions then there is non/parametric way of solving it:
#chisq.test(table1, correct = T) #but we must say that we did it in the interpretation and methodology
chisq.test(table1, correct = T)
##
## Pearson's Chi-squared test
##
## data: table1
## X-squared = 400.88, df = 4, p-value < 2.2e-16
##and at the and the output of descriptive sttistics
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
stargazer (WHO, type="text", title = "Descriptive statistics", digits = 1, out = "table1.txt")
##
## Descriptive statistics
## ======================================================================================
## Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
## --------------------------------------------------------------------------------------
## Population 194 36,360.0 137,903.1 1 1,695.8 24,535.2 1,390,000
## Under15 194 28.7 10.5 13.1 18.7 37.8 50.0
## Over60 194 11.2 7.1 0.8 5.2 16.7 31.9
## FertilityRate 183 2.9 1.5 1.3 1.8 3.9 7.6
## LifeExpectancy 194 70.0 9.3 47 64 76 83
## ChildMortality 194 36.1 38.0 2.2 8.4 56.0 181.6
## CellularSubscribers 184 93.6 41.4 2.6 63.6 120.8 196.4
## LiteracyRate 103 83.7 17.5 31.1 71.6 97.8 99.8
## GNI 162 13,320.9 15,193.0 340.0 2,335.0 17,557.5 86,440.0
## PrimarySchoolEnrollmentMale 101 90.9 11.0 37.2 87.7 98.1 100.0
## PrimarySchoolEnrollmentFemale 101 89.6 12.8 32.5 87.3 97.9 100.0
## --------------------------------------------------------------------------------------
#it saves in the current working directory
#it calculates only variables that are as.numeric ()
#otvorite folder u kojem radite i vidie u njemu spasenu tabelu descriptivne statistike