statistics-dec2019.utf8.md

#STATISTICS/ BASIC----

#**Descriptive statistics----
# The usual measure of central tendency is the mean. When the data contain
# outliers, use the median or the MAD (median absolute deviation) instead.
# The MAD is robust to outliers: it is based on the median of the absolute
# distances of the observations from the median.

# So, for a skewed distribution use:
# 1. median
# 2. Spearman correlation
# 3. MAD
# For a normal distribution use: mean, Pearson correlation and standard deviation.

# To find out whether a distribution is normal or skewed, use visual
# techniques or formal tests.

library(car) # package used here for visually checking the distribution of the data

## Loading required package: carData

library(ggplot2)

WHO <- read.csv("WHO.csv")
ggplot(WHO, aes (x=LifeExpectancy)) + geom_density() # the density curve shows the skew

qqPlot(WHO$LifeExpectancy) # normal data follow the solid blue line and stay inside the dashed blue band; anything outside it indicates a non-normal distribution

## [1] 154  33

# A Q-Q (quantile-quantile) plot compares the quantiles of the sample with those
# of the normal distribution; a reference line is also plotted.

# Testing normality

# Before interpreting the Shapiro test it is important to know what is being
# tested, i.e. what the null hypothesis is.
# The null hypothesis of the Shapiro test is that the distribution is normal.
# If the p-value is below 0.05 we reject the null hypothesis at the 5%
# significance level, i.e. we conclude that the distribution is not normal.

shapiro.test(WHO$LifeExpectancy)

## 
##  Shapiro-Wilk normality test
## 
## data:  WHO$LifeExpectancy
## W = 0.93077, p-value = 5.696e-08

# The simplest way to get descriptive statistics for LifeExpectancy:

summary(WHO$LifeExpectancy)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47.00   64.00   72.50   70.01   76.00   83.00

# summary() does not report the MAD, nor the sd that is used for normal data
mad(WHO$LifeExpectancy) # spread measure for a non-normal distribution

## [1] 8.1543

sd (WHO$LifeExpectancy) # spread measure for a normal distribution

## [1] 9.259075

# Before moving on to quartiles and outliers, a small vector is used to show
# what the median and the mean actually are.
# Example data: 57,40,103,234,93,53,116,98,108,121,22
# The mean is the sum of all values divided by the number of observations:

sum(c(57,40,103,234,93,53,116,98,108,121,22))/11 # so the mean is 95

## [1] 95

# The median works differently: first sort the data in ascending order: 22,40,53,57,93,98,103,108,116,121,234.

# The median is the middle value: with 11 elements it is the sixth one, i.e. 98.
median(c(57,40,103,234,93,53,116,98,108,121,22))

## [1] 98

# The range is another descriptive measure: the difference between the largest
# and the smallest value, here 234 - 22 = 212.
# Note that range() returns the two endpoints (minimum and maximum), not their difference:

range(c(57,40,103,234,93,53,116,98,108,121,22))

## [1]  22 234

# The interquartile range (IQR) is the distance between the first and the third
# quartile (Q3 - Q1), i.e. the spread of the middle 50% of the distribution.
# Q1 is taken here as the median of the values lying strictly between the minimum
# and the median (those two are not counted): 40, 53, 57, 93. In this example Q1 is:

(53+57)/2

## [1] 55

# By the same logic (values between the median and the maximum: 103, 108, 116, 121) Q3 is:
(108+116)/2

## [1] 112

# Check with the built-in function:
# NOTE(review): the hand rule above agrees with summary() for this vector;
# presumably it will not match for every sample size - confirm against ?quantile.
summary(c(57,40,103,234,93,53,116,98,108,121,22))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      22      55      98      95     112     234

#**Outliers----

# Rule of thumb for outliers: values below Q1 - 1.5 x IQR or above Q3 + 1.5 x IQR
# In our example that is:
55 - 1.5 * (112-55)  # the lower fence is -30.5, so there are no outliers on the low side

## [1] -30.5

112 + 1.5 * (112-55)  # everything above 197.5 is an outlier, hence the value 234 is one

## [1] 197.5

# R can check this for us, drawing the boxplot and returning the outliers at the same time
boxplot(c(57,40,103,234,93,53,116,98,108,121,22))$out

## [1] 234

# The same check can be applied to the WHO example (try it yourself):


boxplot(WHO$LifeExpectancy)$out

## numeric(0)

# Variance (s^2): the average squared deviation of each value from the mean.
# 32246 is the sum of the squared deviations of the 11 example values from
# their mean (95); dividing by n - 1 = 10 gives the sample variance.
varijansa <- 32246 / 10  # 3224.6
# Each element contributes its squared distance from the mean, e.g. for 22:
(22-95)^2 # 5329; likewise (40-95)^2 and so on; the sum is divided by n - 1, i.e. 11 - 1 = 10

## [1] 5329

# The standard deviation is used more often; it is the square root of the variance:

sqrt (3224.6) # 56.79

## [1] 56.78556

# Check with R's built-in functions:

var(c(57,40,103,234,93,53,116,98,108,121,22))

## [1] 3224.6

sd (c(57,40,103,234,93,53,116,98,108,121,22))

## [1] 56.78556

# Always visualise the data before computing a correlation:
# the relationship between two variables can be, e.g., parabolic, in which case
# the correlation coefficient comes out as 0.

# Load the data frame
mydataframe <- read.csv("mydf.csv")
cor(mydataframe$dohodak, mydataframe$godine) # correlation between dohodak ("income") and godine ("years"); here it is 1

## [1] 1

plot(mydataframe$godine,mydataframe$dohodak) # independent variable on the x axis, dependent variable on the y axis

# Build an alternative income column by randomly shuffling the original one.
# sample(x) permutes the whole column, so it works for any number of rows
# (a hard-coded size would have to equal nrow(mydataframe)).
# NOTE(review): no seed is set, so the values and the correlation below change on every run.
mydataframe$dohodak1 <- sample(mydataframe$dohodak) # note: this adds a fourth variable to the data frame
cor(mydataframe$dohodak1, mydataframe$godine) # a different result, because the values are now in random order

## [1] -0.04895105

plot(mydataframe$godine,mydataframe$dohodak1)

# VISUALISING THE CORRELATION
vars <- c("dohodak","dohodak1", "godine") # these must be numeric variables
# Side note (informal definition of variance):
# In probability theory and statistics, variance is the expectation of the squared deviation of a random variable from its mean.
# Informally, it measures how far a set of (random) numbers are spread out from their average value.
cormatrix <- cor(mydataframe[,vars]) # build the correlation matrix from the selected variables
library(corrplot) # load the corrplot package

## corrplot 0.84 loaded

corrplot(cormatrix)

# covariance (placeholder heading)

# UNDERSTANDING MEAN, MEDIAN, Q1, Q3
summary(mydataframe$dohodak1)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   490.9   868.2  1245.5  1245.5  1622.7  2000.0

mydataframe$dohodak1 # print the raw values to help understand Q1, Q3 and the median

##  [1] 1862.8136  490.9500 1451.2545  902.5091 1039.6955 2000.0000 1314.0682
##  [8] 1176.8818  628.1364 1725.6273 1588.4409  765.3227

plot(density(mydataframe$dohodak1)) # visual display of the distribution

mean (mydataframe$dohodak1)

## [1] 1245.475

median (mydataframe$dohodak1)

## [1] 1245.475

quantile (mydataframe$dohodak1) # quantiles at 0%, 25%, 50%, 75% and 100%

##        0%       25%       50%       75%      100% 
##  490.9500  868.2125 1245.4750 1622.7375 2000.0000

var (mydataframe$dohodak1) # variance

## [1] 244661.3

sd(mydataframe$dohodak1) # standard deviation; note that sd = sqrt(var), i.e. sqrt(244661.3)

## [1] 494.6325

#STATISTICAL TESTS----  
## test statistic = signal/noise = variance explained by the model / variance not explained by the model = effect/error
## the larger t (or another test statistic) is, the more likely you are to reject H0, since there is more signal than noise


#t distribution - can be used with any statistic having a bell-shaped distribution. The CLT states that the sampling distribution of a statistic (e.g. the mean) will be close to normal with a large enough sample size.
##As a rough rule of thumb, the sampling distribution can be treated as roughly normal under any of the following conditions:
##1. the population distribution is normal; or
##2. the sample distribution is symmetric and the sample size is <= 15; or
##3. the sample distribution is moderately skewed and the sample size is 16 <= n <= 30; or
##4. the sample size is > 30 without outliers.

#RULE: if the p-value is less than alpha WE REJECT THE NULL HYPOTHESIS
#alpha = significance level of the test

#t test: from here on only the code for the tests is shown----
#t.test(x, mu=30) #single sample test
#t.test (x,y) #independent t test #two samples are independent
#t.test (x1, x2, paired = TRUE) #dependent: used for pre/post data, e.g. before and after an experiment
#t.test(pre, post, paired = TRUE, alternative = "less") #one tailed dependent t test

# The t test is used when the grouping factor has only two levels.
# In the WHO data we already know the factor variable has more than two levels,
# so the vectors to compare have to be formed first.

# Example (chi-square): Tokyo data; list the files in the working directory first
list.files()

##  [1] "Anova test of variance.R"                
##  [2] "boxplot.JPG"                             
##  [3] "Capture.JPG"                             
##  [4] "mydf.csv"                                
##  [5] "prvi_put_registrovana_vozila_12_2015.csv"
##  [6] "statistics-dec2019.html"                 
##  [7] "statistics-dec2019.R"                    
##  [8] "statistics-dec2019.spin.R"               
##  [9] "statistics-dec2019.spin.Rmd"             
## [10] "statistics dec2019.R"                    
## [11] "statistics.html"                         
## [12] "statistics.R"                            
## [13] "table1.txt"                              
## [14] "Tokyo_updated.xlsx"                      
## [15] "WHO.csv"

# Exercise: using Tokyo_updated, test whether gender and the tendency to buy a
# particular brand differ statistically.
Tokyo <- readxl::read_xlsx( "Tokyo_updated.xlsx")

## New names:
## * `` -> ...13

# Inspect the column names (the unnamed 13th column was renamed to ...13 on import)
names(Tokyo)

##  [1] "Store"       "Brand"       "Type"        "Gender"      "Size"       
##  [6] "Color"       "Category"    "Sales Price" "Date"        "Time"       
## [11] "Loyalty"     "Month"       "...13"

# Step one: build the two vectors to compare
# 1. t.test
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# pull() extracts the `Sales Price` column as a plain (unnamed) numeric vector,
# one vector per gender.
SP.female <- Tokyo %>%
  filter(Gender == "Female") %>%
  pull(`Sales Price`)
SP.male <- Tokyo %>%
  filter(Gender == "Male") %>%
  pull(`Sales Price`)

t.test(SP.female, SP.male) # there is a statistically significant difference between the genders: the mean sales price is 100.3 dollars for women and 125.9 dollars for men

## 
##  Welch Two Sample t-test
## 
## data:  SP.female and SP.male
## t = -10.862, df = 1017.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -30.20866 -20.96414
## sample estimates:
## mean of x mean of y 
##  100.3078  125.8942

# Since the p-value is below 0.05 we reject the null hypothesis in favour of the alternative



#CHI SQUARE TEST----
##SUM((O - E)^2 / E) ... O is observed, E is expected
##this is the goodness-of-fit statistic and it is used for categorical data

#ASSUMPTIONS FOR THE CHI SQUARE TEST

##1. Random sample
##2. Independent observations in the sample (one observation per subject). This means one person cannot be in both groups
##3. All expected counts are greater than 1 in each of our cells
##4. No more than 20% of cells have expected counts less than five

#STEPS IN CONDUCTING THE CHI SQUARE TEST
## 1. Clearly state the null and the alternative hypothesis
## 2. Identify an appropriate test and significance level
## 3. Analyze sample data:
   ###3a Create a table to organize data
   ###3b Compute the chi-square statistic and compare its p-value with alpha
## 4. Interpret the results

#goodness of fit: tests whether the observed distribution is equal to the expected distribution

#test of independence: tests whether variable x is independent of variable y, not how they are related

#chisq example----
# The test of independence is the more commonly used one; the steps (on the Tokyo data) are:
# 1. first form a contingency table
table1 <- table (Tokyo$Gender, Tokyo$Brand)

# 2. then check the assumptions
#chisq.test(table1)$expected # if the expected counts are greater than 5 we are good to go with testing
chisq.test(table1)$expected

##         
##             Adidas    Asics     Nike
##   Female 102.88217 511.0297 296.0881
##   Male    52.34554 260.0074 150.6470
##   Unisex  57.77229 286.9628 166.2649

# 3. then run the test and interpret the results
#chisq.test(table1)
chisq.test(table1) # reject the null hypothesis

## 
##  Pearson's Chi-squared test
## 
## data:  table1
## X-squared = 400.88, df = 4, p-value < 2.2e-16

# Note: `correct` only toggles the continuity correction that chisq.test()
# applies to 2 x 2 tables, so for this 3 x 3 table it changes nothing (the
# result is identical to the plain call).
# If the expected-count assumption were not satisfied, use
# chisq.test(table1, simulate.p.value = TRUE) or fisher.test(table1) instead,
# and say so in the methodology and the interpretation.
#chisq.test(table1, correct = TRUE)
chisq.test(table1, correct = TRUE)

## 
##  Pearson's Chi-squared test
## 
## data:  table1
## X-squared = 400.88, df = 4, p-value < 2.2e-16

# And at the end, the output table of descriptive statistics
library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

# Print the table as text (1 decimal place) and also write it to table1.txt
stargazer (WHO, type="text", title = "Descriptive statistics", digits = 1, out = "table1.txt")

## 
## Descriptive statistics
## ======================================================================================
## Statistic                      N    Mean   St. Dev.   Min  Pctl(25) Pctl(75)    Max   
## --------------------------------------------------------------------------------------
## Population                    194 36,360.0 137,903.1   1   1,695.8  24,535.2 1,390,000
## Under15                       194   28.7     10.5    13.1    18.7     37.8     50.0   
## Over60                        194   11.2      7.1     0.8    5.2      16.7     31.9   
## FertilityRate                 183   2.9       1.5     1.3    1.8      3.9       7.6   
## LifeExpectancy                194   70.0      9.3     47      64       76       83    
## ChildMortality                194   36.1     38.0     2.2    8.4      56.0     181.6  
## CellularSubscribers           184   93.6     41.4     2.6    63.6    120.8     196.4  
## LiteracyRate                  103   83.7     17.5    31.1    71.6     97.8     99.8   
## GNI                           162 13,320.9 15,193.0  340.0 2,335.0  17,557.5 86,440.0 
## PrimarySchoolEnrollmentMale   101   90.9     11.0    37.2    87.7     98.1     100.0  
## PrimarySchoolEnrollmentFemale 101   89.6     12.8    32.5    87.3     97.9     100.0  
## --------------------------------------------------------------------------------------

#the table is saved in the current working directory
#only numeric variables are summarised

#open the folder you are working in to see the saved table of descriptive statistics (table1.txt)

statistics-dec2019.R

afeta

2019-12-10