WQD7001 - Tutorial 5 EDA

Load Data

library(memisc)
df = attitude

1. See basic descriptive statistics

summary(df)

##      rating        complaints     privileges       learning         raises     
##  Min.   :40.00   Min.   :37.0   Min.   :30.00   Min.   :34.00   Min.   :43.00  
##  1st Qu.:58.75   1st Qu.:58.5   1st Qu.:45.00   1st Qu.:47.00   1st Qu.:58.25  
##  Median :65.50   Median :65.0   Median :51.50   Median :56.50   Median :63.50  
##  Mean   :64.63   Mean   :66.6   Mean   :53.13   Mean   :56.37   Mean   :64.63  
##  3rd Qu.:71.75   3rd Qu.:77.0   3rd Qu.:62.50   3rd Qu.:66.75   3rd Qu.:71.00  
##  Max.   :85.00   Max.   :90.0   Max.   :83.00   Max.   :75.00   Max.   :88.00  
##     critical        advance     
##  Min.   :49.00   Min.   :25.00  
##  1st Qu.:69.25   1st Qu.:35.00  
##  Median :77.50   Median :41.00  
##  Mean   :74.77   Mean   :42.93  
##  3rd Qu.:80.00   3rd Qu.:47.75  
##  Max.   :92.00   Max.   :72.00

a) What is the difference between (attitude[3]) and (attitude$learning)

df[3] returns result as a “data frame”, df$learning as a numeric vector

class(df[3])

## [1] "data.frame"

class(df$learning)

## [1] "numeric"

2. Lists name of variables in a dataset

names(df)

## [1] "rating"     "complaints" "privileges" "learning"   "raises"    
## [6] "critical"   "advance"

3. Calculate number of rows in a dataset

nrow(df)

## [1] 30

4. Calculate number of columns in a dataset

length(df)

## [1] 7

5. List structure of a dataset

str(df)

## 'data.frame':    30 obs. of  7 variables:
##  $ rating    : num  43 63 71 61 81 43 58 71 72 67 ...
##  $ complaints: num  51 64 70 63 78 55 67 75 82 61 ...
##  $ privileges: num  30 51 68 45 56 49 42 50 72 45 ...
##  $ learning  : num  39 54 69 47 66 44 56 55 67 47 ...
##  $ raises    : num  61 63 76 54 71 54 66 70 71 62 ...
##  $ critical  : num  92 73 86 84 83 49 68 66 83 80 ...
##  $ advance   : num  45 47 48 35 47 34 35 41 31 41 ...

6. See first 6 rows (by default) of dataset

head(df)

##   rating complaints privileges learning raises critical advance
## 1     43         51         30       39     61       92      45
## 2     63         64         51       54     63       73      47
## 3     71         70         68       69     76       86      48
## 4     61         63         45       47     54       84      35
## 5     81         78         56       66     71       83      47
## 6     43         55         49       44     54       49      34

7. See first n rows of dataset - Select to see the first 15 rows of dataset

head(df, 15)

##    rating complaints privileges learning raises critical advance
## 1      43         51         30       39     61       92      45
## 2      63         64         51       54     63       73      47
## 3      71         70         68       69     76       86      48
## 4      61         63         45       47     54       84      35
## 5      81         78         56       66     71       83      47
## 6      43         55         49       44     54       49      34
## 7      58         67         42       56     66       68      35
## 8      71         75         50       55     70       66      41
## 9      72         82         72       67     71       83      31
## 10     67         61         45       47     62       80      41
## 11     64         53         53       58     58       67      34
## 12     67         60         47       39     59       74      41
## 13     69         62         57       42     55       63      25
## 14     68         83         83       45     59       77      35
## 15     77         77         54       72     79       77      46

8. See all rows but the last row

df[-nrow(df),]

##    rating complaints privileges learning raises critical advance
## 1      43         51         30       39     61       92      45
## 2      63         64         51       54     63       73      47
## 3      71         70         68       69     76       86      48
## 4      61         63         45       47     54       84      35
## 5      81         78         56       66     71       83      47
## 6      43         55         49       44     54       49      34
## 7      58         67         42       56     66       68      35
## 8      71         75         50       55     70       66      41
## 9      72         82         72       67     71       83      31
## 10     67         61         45       47     62       80      41
## 11     64         53         53       58     58       67      34
## 12     67         60         47       39     59       74      41
## 13     69         62         57       42     55       63      25
## 14     68         83         83       45     59       77      35
## 15     77         77         54       72     79       77      46
## 16     81         90         50       72     60       54      36
## 17     74         85         64       69     79       79      63
## 18     65         60         65       75     55       80      60
## 19     65         70         46       57     75       85      46
## 20     50         58         68       54     64       78      52
## 21     50         40         33       34     43       64      33
## 22     64         61         52       62     66       80      41
## 23     53         66         52       50     63       80      37
## 24     40         37         42       58     50       57      49
## 25     63         54         42       48     66       75      33
## 26     66         77         66       63     88       76      72
## 27     78         75         58       74     80       78      49
## 28     48         57         44       45     51       83      38
## 29     85         85         71       71     77       74      55

#head(df, -1) #second method

9. See last 6 rows (by default) of a dataset

tail(df)

##    rating complaints privileges learning raises critical advance
## 25     63         54         42       48     66       75      33
## 26     66         77         66       63     88       76      72
## 27     78         75         58       74     80       78      49
## 28     48         57         44       45     51       83      38
## 29     85         85         71       71     77       74      55
## 30     82         82         39       59     64       78      39

10. See last n rows of dataset - Select to see the last 12 rows of dataset.

tail(df, 12)

##    rating complaints privileges learning raises critical advance
## 19     65         70         46       57     75       85      46
## 20     50         58         68       54     64       78      52
## 21     50         40         33       34     43       64      33
## 22     64         61         52       62     66       80      41
## 23     53         66         52       50     63       80      37
## 24     40         37         42       58     50       57      49
## 25     63         54         42       48     66       75      33
## 26     66         77         66       63     88       76      72
## 27     78         75         58       74     80       78      49
## 28     48         57         44       45     51       83      38
## 29     85         85         71       71     77       74      55
## 30     82         82         39       59     64       78      39

11. See the last n rows but the first row

tail(df, -1)

##    rating complaints privileges learning raises critical advance
## 2      63         64         51       54     63       73      47
## 3      71         70         68       69     76       86      48
## 4      61         63         45       47     54       84      35
## 5      81         78         56       66     71       83      47
## 6      43         55         49       44     54       49      34
## 7      58         67         42       56     66       68      35
## 8      71         75         50       55     70       66      41
## 9      72         82         72       67     71       83      31
## 10     67         61         45       47     62       80      41
## 11     64         53         53       58     58       67      34
## 12     67         60         47       39     59       74      41
## 13     69         62         57       42     55       63      25
## 14     68         83         83       45     59       77      35
## 15     77         77         54       72     79       77      46
## 16     81         90         50       72     60       54      36
## 17     74         85         64       69     79       79      63
## 18     65         60         65       75     55       80      60
## 19     65         70         46       57     75       85      46
## 20     50         58         68       54     64       78      52
## 21     50         40         33       34     43       64      33
## 22     64         61         52       62     66       80      41
## 23     53         66         52       50     63       80      37
## 24     40         37         42       58     50       57      49
## 25     63         54         42       48     66       75      33
## 26     66         77         66       63     88       76      72
## 27     78         75         58       74     80       78      49
## 28     48         57         44       45     51       83      38
## 29     85         85         71       71     77       74      55
## 30     82         82         39       59     64       78      39

12. Number of missing values

Which function will returns number of missing values in each variable of a dataset?
To check missing values in each variable: use is.na() function together with either apply(), sapply() or colSums() functions

apply(df, 2, FUN = function(x) { sum(is.na(x)) })

##     rating complaints privileges   learning     raises   critical    advance 
##          0          0          0          0          0          0          0

#colSums(is.na(df)) #second method

13. Number of missing values in a single variable

sum(is.na(df$rating))

## [1] 0

14. Plot a simple graph, which will appear on a screen device.

with(df, plot(learning, rating, main = "Rating Attitude", col = "red", pch = 21))

15. Plot the graph shown below, and make it appear on a file device (a pdf file)

## Open PDF device; create `learning_attitude.pdf` in the working directory
pdf(file = "learning_attitude.pdf")  

## Create plot & annotate
with(df, plot(privileges, learning, main = "Learning Attitude"))

## Close dthe PDF file device
dev.off()

## png 
##   2

Graph is saved as pdf on machine