Importing the data

Start with data() (run the line with CTRL+ENTER)

# List the data sets available in the packages on the search path.
data()

# List the data sets of every installed package, attached or not.
data(package = .packages(all.available = TRUE))

# Keep a working copy of the built-in USArrests data set.
mydata <- USArrests

For homework: Explanation of variables:

Descriptive statistics:

# Per-column minimum, quartiles, median, mean and maximum.
summary(mydata)
##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00
library(pastecs)
# Extended descriptive statistics for every column, rounded to two decimals.
round(stat.desc(mydata), 2)
##              Murder Assault UrbanPop    Rape
## nbr.val       50.00   50.00    50.00   50.00
## nbr.null       0.00    0.00     0.00    0.00
## nbr.na         0.00    0.00     0.00    0.00
## min            0.80   45.00    32.00    7.30
## max           17.40  337.00    91.00   46.00
## range         16.60  292.00    59.00   38.70
## sum          389.40 8538.00  3277.00 1061.60
## median         7.25  159.00    66.00   20.10
## mean           7.79  170.76    65.54   21.23
## SE.mean        0.62   11.79     2.05    1.32
## CI.mean.0.95   1.24   23.68     4.11    2.66
## var           18.97 6945.17   209.52   87.73
## std.dev        4.36   83.34    14.47    9.37
## coef.var       0.56    0.49     0.22    0.44

Importing the data 2

#install.packages("psych") #Installing package
library(psych) #Activating package
# Import the sat.act data set; this replaces the earlier contents of mydata.
mydata <- sat.act

# Preview the first rows of the data set.
head(mydata)
##       gender education age ACT SATV SATQ
## 29442      2         3  19  24  500  500
## 29457      2         3  23  35  600  500
## 29498      2         3  20  21  480  470
## 29503      1         4  27  26  550  520
## 29504      1         2  33  31  600  550
## 29518      1         5  26  28  640  640
# Rename the fourth column (ACT) - try to use a single word for the new name.
names(mydata)[4] <- "Highschool"

# Print the first rows again so the table below shows the new column name.
head(mydata)
##       gender education age Highschool SATV SATQ
## 29442      2         3  19         24  500  500
## 29457      2         3  23         35  600  500
## 29498      2         3  20         21  480  470
## 29503      1         4  27         26  550  520
## 29504      1         2  33         31  600  550
## 29518      1         5  26         28  640  640
# gender is stored as the numeric codes 1 and 2, so mean() averages the codes.
mean(mydata$gender)
## [1] 1.647143

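This does not make sense: gender is a category coded as 1 and 2, so the mean of those codes has no meaningful interpretation. Try this: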

# Recode gender into a labelled factor. factor() is given three things: the
# variable to convert, its original codes (levels), and the labels that
# replace those codes.
mydata[["genderF"]] <- factor(
  mydata[["gender"]],
  levels = c(1, 2),
  labels = c("M", "F")
)
summary(mydata)
##      gender        education          age          Highschool   
##  Min.   :1.000   Min.   :0.000   Min.   :13.00   Min.   : 3.00  
##  1st Qu.:1.000   1st Qu.:3.000   1st Qu.:19.00   1st Qu.:25.00  
##  Median :2.000   Median :3.000   Median :22.00   Median :29.00  
##  Mean   :1.647   Mean   :3.164   Mean   :25.59   Mean   :28.55  
##  3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:29.00   3rd Qu.:32.00  
##  Max.   :2.000   Max.   :5.000   Max.   :65.00   Max.   :36.00  
##                                                                 
##       SATV            SATQ       genderF
##  Min.   :200.0   Min.   :200.0   M:247  
##  1st Qu.:550.0   1st Qu.:530.0   F:453  
##  Median :620.0   Median :620.0          
##  Mean   :612.2   Mean   :610.2          
##  3rd Qu.:700.0   3rd Qu.:700.0          
##  Max.   :800.0   Max.   :800.0          
##                  NA's   :13

For genderF, only the frequencies of each category are reported.

A factor is a categorical variable.

If you have missing values (NAs) - SATQ has 13 of them:

# SATQ contains missing values, so mean() returns NA unless na.rm = TRUE is set.
mean(mydata$SATQ)
## [1] NA

How can I get rid of the missing values (NAs)?

#install.packages("tidyr")
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:pastecs':
## 
##     extract
# Drop every row that has a missing value in any column.
mydata_clean <- tidyr::drop_na(mydata)

# Summarise the cleaned data; no NA's row should appear any more.
summary(mydata_clean)
##      gender        education          age          Highschool   
##  Min.   :1.000   Min.   :0.000   Min.   :13.00   Min.   : 3.00  
##  1st Qu.:1.000   1st Qu.:3.000   1st Qu.:19.00   1st Qu.:25.00  
##  Median :2.000   Median :3.000   Median :22.00   Median :29.00  
##  Mean   :1.643   Mean   :3.172   Mean   :25.64   Mean   :28.55  
##  3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:29.00   3rd Qu.:32.00  
##  Max.   :2.000   Max.   :5.000   Max.   :65.00   Max.   :36.00  
##       SATV            SATQ       genderF
##  Min.   :200.0   Min.   :200.0   M:245  
##  1st Qu.:550.0   1st Qu.:530.0   F:442  
##  Median :620.0   Median :620.0          
##  Mean   :612.3   Mean   :610.2          
##  3rd Qu.:700.0   3rd Qu.:700.0          
##  Max.   :800.0   Max.   :800.0
# With the incomplete rows removed, mean() returns a number instead of NA.
mean(mydata_clean$SATQ)
## [1] 610.2169

Exercise for us: Create mydata2, which includes only students aged between 20 and 25 (inclusive).

# Base-R row filter: keep students aged 20 to 25 (both ends included).
# Don't forget the comma inside the square brackets - it separates the row
# selection from the (empty, i.e. all) column selection.
mydata2 <- mydata_clean[with(mydata_clean, age >= 20 & age <= 25), ]
#install.packages("dplyr")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:pastecs':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Same 20-25 (inclusive) age filter as mydata2, written with dplyr.
# The earlier draft that filtered only on `age >= 20` is removed because its
# result was overwritten immediately, and dplyr is not attached a second time
# because library(dplyr) has already been called above.
mydata3 <- mydata_clean %>%
  filter(age >= 20 & age <= 25)

Separate descriptive statistics by groups of units:

# Descriptive statistics of SATV computed separately for each genderF level.
# NOTE(review): this uses mydata (all rows), not mydata_clean - confirm intended.
describeBy(mydata$SATV, group = mydata$genderF)
## 
##  Descriptive statistics by group 
## group: M
##    vars   n   mean     sd median trimmed    mad min max range  skew kurtosis
## X1    1 247 615.11 114.16    630  622.07 118.61 200 800   600 -0.63     0.13
##      se
## X1 7.26
## ------------------------------------------------------------ 
## group: F
##    vars   n   mean     sd median trimmed    mad min max range  skew kurtosis
## X1    1 453 610.66 112.31    620  617.91 103.78 200 800   600 -0.65     0.42
##      se
## X1 5.28