Adapted from: “Getting and Cleaning Data”, week 3 - lesson 2 by Johns Hopkins University (Coursera)



Getting the data…

Superheroes <- read.csv("~/Data Science/Datasets/superheroes_information.csv")


Look at a bit of the data

head(Superheroes, n=10)
tail(Superheroes, n=5)


Make summary

summary(Superheroes)
       X                 name        Gender      Eye.color                  Race       Hair.color 
 Min.   :  0.0   Goliath   :  3   -     : 29   blue   :225   -                :304   -      :172  
 1st Qu.:183.2   Spider-Man:  3   Female:200   -      :172   Human            :208   Black  :158  
 Median :366.5   Angel     :  2   Male  :505   brown  :126   Mutant           : 63   Blond  : 99  
 Mean   :366.5   Atlas     :  2                green  : 73   God / Eternal    : 14   Brown  : 86  
 3rd Qu.:549.8   Atom      :  2                red    : 46   Cyborg           : 11   No Hair: 75  
 Max.   :733.0   Batgirl   :  2                black  : 23   Human / Radiation: 11   Red    : 51  
                 (Other)   :720                (Other): 69   (Other)          :123   (Other): 93  
     Height                  Publisher     Skin.color    Alignment       Weight      
 Min.   :-99.0   Marvel Comics    :388   -      :662   -      :  7   Min.   :-99.00  
 1st Qu.:-99.0   DC Comics        :215   green  : 21   bad    :207   1st Qu.:-99.00  
 Median :175.0   NBC - Heroes     : 19   blue   :  9   good   :496   Median : 62.00  
 Mean   :102.3   Dark Horse Comics: 18   red    :  9   neutral: 24   Mean   : 43.86  
 3rd Qu.:185.0                    : 15   white  :  7                 3rd Qu.: 90.00  
 Max.   :975.0   George Lucas     : 14   grey   :  5                 Max.   :900.00  
                 (Other)          : 65   (Other): 21                 NA's   :2       


More in depth information

str(Superheroes)
'data.frame':   734 obs. of  11 variables:
 $ X         : int  0 1 2 3 4 5 6 7 8 9 ...
 $ name      : Factor w/ 715 levels "Abe Sapien","Abin Sur",..: 3 1 2 4 5 6 7 8 9 10 ...
 $ Gender    : Factor w/ 3 levels "-","Female","Male": 3 3 3 3 3 3 3 3 2 3 ...
 $ Eye.color : Factor w/ 23 levels "-","amber","black",..: 20 4 4 9 4 4 4 4 4 7 ...
 $ Race      : Factor w/ 62 levels "-","Alien","Alpha",..: 24 33 56 29 12 24 1 24 1 24 ...
 $ Hair.color: Factor w/ 30 levels "-","Auburn","black",..: 18 18 18 18 4 18 7 7 7 9 ...
 $ Height    : num  203 191 185 203 -99 193 -99 185 173 178 ...
 $ Publisher : Factor w/ 25 levels "","ABC Studios",..: 13 3 4 13 13 13 15 4 13 13 ...
 $ Skin.color: Factor w/ 17 levels "-","black","blue",..: 1 3 13 1 1 1 1 1 1 1 ...
 $ Alignment : Factor w/ 4 levels "-","bad","good",..: 3 3 3 2 2 2 3 3 3 3 ...
 $ Weight    : num  441 65 90 441 -99 122 -99 88 61 81 ...


Quantiles of quantitative variables

quantile(Superheroes$Height, na.rm = T)
  0%  25%  50%  75% 100% 
 -99  -99  175  185  975 
quantile(Superheroes$Weight, probs = c(0, 0.33, 0.67, 1), na.rm = T)
    0%    33%    67%   100% 
-99.00  17.23  83.00 900.00 


Make table

table(Superheroes$Race, useNA = "ifany")

                 -              Alien              Alpha             Amazon            Android 
               304                  7                  5                  2                  9 
            Animal          Asgardian          Atlantean            Bizarro         Bolovaxian 
                 4                  5                  5                  1                  1 
             Clone      Cosmic Entity             Cyborg           Czarnian Dathomirian Zabrak 
                 1                  4                 11                  1                  1 
          Demi-God              Demon            Eternal     Flora Colossus        Frost Giant 
                 2                  6                  2                  1                  2 
     God / Eternal            Gorilla             Gungan              Human    Human / Altered 
                14                  1                  1                208                  3 
     Human / Clone     Human / Cosmic         Human-Kree  Human / Radiation      Human-Spartoi 
                 1                  2                  2                 11                  1 
      Human-Vulcan    Human-Vuldarian      Icthyo Sapien            Inhuman              Kaiju 
                 1                  1                  1                  4                  1 
   Kakarantharaian          Korugaran         Kryptonian          Luphomoid              Maiar 
                 1                  1                  7                  1                  1 
           Martian          Metahuman             Mutant     Mutant / Clone            New God 
                 1                  2                 63                  1                  3 
          Neyaphem          Parademon             Planet             Rodian             Saiyan 
                 1                  1                  1                  1                  2 
           Spartoi          Strontian           Symbiote           Talokite         Tamaranean 
                 1                  1                  9                  1                  1 
           Ungaran            Vampire    Xenomorph XX121             Yautja     Yoda's species 
                 1                  2                  1                  1                  1 
     Zen-Whoberian             Zombie 
                 1                  1 
table(Superheroes$Race, Superheroes$Gender)
                    
                       - Female Male
  -                   22     98  184
  Alien                0      2    5
  Alpha                0      2    3
  Amazon               0      2    0
  Android              0      0    9
  Animal               0      0    4
  Asgardian            0      3    2
  Atlantean            0      2    3
  Bizarro              0      0    1
  Bolovaxian           0      0    1
  Clone                0      0    1
  Cosmic Entity        2      0    2
  Cyborg               0      3    8
  Czarnian             0      0    1
  Dathomirian Zabrak   0      0    1
  Demi-God             0      1    1
  Demon                0      0    6
  Eternal              0      0    2
  Flora Colossus       0      0    1
  Frost Giant          0      0    2
  God / Eternal        2      0   12
  Gorilla              0      0    1
  Gungan               0      0    1
  Human                1     50  157
  Human / Altered      0      0    3
  Human / Clone        0      0    1
  Human / Cosmic       0      1    1
  Human-Kree           0      2    0
  Human / Radiation    0      2    9
  Human-Spartoi        0      0    1
  Human-Vulcan         0      0    1
  Human-Vuldarian      0      0    1
  Icthyo Sapien        0      0    1
  Inhuman              0      2    2
  Kaiju                1      0    0
  Kakarantharaian      0      0    1
  Korugaran            0      0    1
  Kryptonian           0      3    4
  Luphomoid            0      1    0
  Maiar                0      0    1
  Martian              0      0    1
  Metahuman            0      1    1
  Mutant               0     20   43
  Mutant / Clone       0      1    0
  New God              0      1    2
  Neyaphem             0      0    1
  Parademon            1      0    0
  Planet               0      0    1
  Rodian               0      0    1
  Saiyan               0      0    2
  Spartoi              0      0    1
  Strontian            0      0    1
  Symbiote             0      0    9
  Talokite             0      1    0
  Tamaranean           0      1    0
  Ungaran              0      0    1
  Vampire              0      0    2
  Xenomorph XX121      0      0    1
  Yautja               0      0    1
  Yoda's species       0      0    1
  Zen-Whoberian        0      1    0
  Zombie               0      0    1


Check for missing values

sum(is.na(Superheroes))
[1] 2
any(is.na(Superheroes[,c("Height", "Weight")]))
[1] TRUE
all(Superheroes$Weight > 0)
[1] FALSE


Check for NAs with row and column sums

colSums(is.na(Superheroes))
         X       name     Gender  Eye.color       Race Hair.color     Height  Publisher Skin.color 
         0          0          0          0          0          0          0          0          0 
 Alignment     Weight 
         0          2 
all(is.na(Superheroes)==0)
[1] FALSE


Values with specific characteristics

table(Superheroes$Hair.color %in% "No Hair")

FALSE  TRUE 
  659    75 
table(Superheroes$Publisher %in% c("DC Comics", "Marvel Comics"))

FALSE  TRUE 
  131   603 


Subset by values with specific characteristics

Superheroes[Superheroes$Publisher %in% c("DC Comics", "Marvel Comics"), c("name", "Gender", "Publisher")]


Filtering data and add variable

Superheroes$BMI <- round((Superheroes$Weight/(Superheroes$Height*0.01)/(Superheroes$Height*0.01)), 3)
SH <- Superheroes[Superheroes$Height > 0 & Superheroes$Height <= 250 & Superheroes$Weight > 0 & Superheroes$Weight <= 300,]
SH[order(SH$BMI, SH$Publisher),]




Cross tabs 1

SHBMI <- xtabs(BMI ~ Gender + Alignment, data = SH)
SHBMI
        Alignment
Gender          -      bad     good  neutral
  -         0.000   24.990  291.051    0.000
  Female    0.000  522.165 2441.207   38.372
  Male     49.085 5448.231 5687.908  458.538


Open a new dataset

data("UCBAdmissions")
DF = as.data.frame(UCBAdmissions)
summary(DF)
      Admit       Gender   Dept       Freq      
 Admitted:12   Male  :12   A:4   Min.   :  8.0  
 Rejected:12   Female:12   B:4   1st Qu.: 80.0  
                           C:4   Median :170.0  
                           D:4   Mean   :188.6  
                           E:4   3rd Qu.:302.5  
                           F:4   Max.   :512.0  


Cross tabs 2

xt <- xtabs(Freq ~ Gender + Admit, data=DF)
xt
        Admit
Gender   Admitted Rejected
  Male       1198     1493
  Female      557     1278


Another new dataset for example

data("warpbreaks")
warpbreaks$replicate <- rep(1:9, len=54)
warpbreaks


Cross tabs 3

xt = xtabs(breaks ~., data=warpbreaks)
xt
, , replicate = 1

    tension
wool  L  M  H
   A 26 18 36
   B 27 42 20

, , replicate = 2

    tension
wool  L  M  H
   A 30 21 21
   B 14 26 21

, , replicate = 3

    tension
wool  L  M  H
   A 54 29 24
   B 29 19 24

, , replicate = 4

    tension
wool  L  M  H
   A 25 17 18
   B 19 16 17

, , replicate = 5

    tension
wool  L  M  H
   A 70 12 10
   B 29 39 13

, , replicate = 6

    tension
wool  L  M  H
   A 52 18 43
   B 31 28 15

, , replicate = 7

    tension
wool  L  M  H
   A 51 35 28
   B 41 21 15

, , replicate = 8

    tension
wool  L  M  H
   A 26 30 15
   B 20 39 16

, , replicate = 9

    tension
wool  L  M  H
   A 67 36 26
   B 44 29 28


Flat tables

ftable(xt)
             replicate  1  2  3  4  5  6  7  8  9
wool tension                                     
A    L                 26 30 54 25 70 52 51 26 67
     M                 18 21 29 17 12 18 35 30 36
     H                 36 21 24 18 10 43 28 15 26
B    L                 27 14 29 19 29 31 41 20 44
     M                 42 26 19 16 39 28 21 39 29
     H                 20 21 24 17 13 15 15 16 28


Size of the dataset

fakeData <- rnorm(10^6)


object.size(Superheroes)
102872 bytes
object.size(fakeData)
8000040 bytes


Print file size

print(object.size(fakeData), units = "Mb")
7.6 Mb



END

LS0tCnRpdGxlOiAiU3VtbWFyaXppbmcgRGF0YSB3aXRoIFN1cGVyaGVyb2VzIERhdGFzZXQgYW5kIG1vcmUiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMjIyMgQWRhcHRlZCBmcm9tOiAiR2V0dGluZyBhbmQgQ2xlYW5pbmcgRGF0YSIsIHdlZWsgMyAtIGxlc3NvbiAyIGJ5IEpvaG5zIEhvcGtpbnMgVW5pdmVyc2l0eSAoQ291cnNlcmEpCgo8YnIgLz4KPGJyIC8+CgpHZXR0aW5nIHRoZSBkYXRhLi4uCgpgYGB7cn0KU3VwZXJoZXJvZXMgPC0gcmVhZC5jc3YoIn4vRGF0YSBTY2llbmNlL0RhdGFzZXRzL3N1cGVyaGVyb2VzX2luZm9ybWF0aW9uLmNzdiIpCmBgYAoKPGJyIC8+Ckxvb2sgYXQgYSBiaXQgb2YgdGhlIGRhdGEKYGBge3J9CmhlYWQoU3VwZXJoZXJvZXMsIG49MTApCmBgYAoKYGBge3J9CnRhaWwoU3VwZXJoZXJvZXMsIG49NSkKYGBgCgo8YnIgLz4KTWFrZSBzdW1tYXJ5CmBgYHtyfQpzdW1tYXJ5KFN1cGVyaGVyb2VzKQpgYGAKCjxiciAvPgpNb3JlIGluIGRlcHRoIGluZm9ybWF0aW9uCmBgYHtyfQpzdHIoU3VwZXJoZXJvZXMpCmBgYAoKPGJyIC8+ClF1YW50aWxlcyBvZiBxdWFudGl0YXRpdmUgdmFyaWFibGVzCmBgYHtyfQpxdWFudGlsZShTdXBlcmhlcm9lcyRIZWlnaHQsIG5hLnJtID0gVCkKYGBgCmBgYHtyfQpxdWFudGlsZShTdXBlcmhlcm9lcyRXZWlnaHQsIHByb2JzID0gYygwLCAwLjMzLCAwLjY3LCAxKSwgbmEucm0gPSBUKQpgYGAKCjxici8+Ck1ha2UgdGFibGUKYGBge3J9CnRhYmxlKFN1cGVyaGVyb2VzJFJhY2UsIHVzZU5BID0gImlmYW55IikKYGBgCmBgYHtyfQp0YWJsZShTdXBlcmhlcm9lcyRSYWNlLCBTdXBlcmhlcm9lcyRHZW5kZXIpCmBgYAoKPGJyIC8+CkNoZWNrIGZvciBtaXNzaW5nIHZhbHVlcwpgYGB7cn0Kc3VtKGlzLm5hKFN1cGVyaGVyb2VzKSkKYGBgCmBgYHtyfQphbnkoaXMubmEoU3VwZXJoZXJvZXNbLGMoIkhlaWdodCIsICJXZWlnaHQiKV0pKQpgYGAKYGBge3J9CmFsbChTdXBlcmhlcm9lcyRXZWlnaHQgPiAwKQpgYGAKCjxiciAvPgpDaGVjayBmb3IgTkFzIHdpdGggcm93IGFuZCBjb2x1bW4gc3VtcwpgYGB7cn0KY29sU3Vtcyhpcy5uYShTdXBlcmhlcm9lcykpCmBgYApgYGB7cn0KYWxsKGlzLm5hKFN1cGVyaGVyb2VzKT09MCkKYGBgCgo8YnIgLz4KVmFsdWVzIHdpdGggc3BlY2lmaWMgY2hhcmFjdGVyaXN0aWNzCgpgYGB7cn0KdGFibGUoU3VwZXJoZXJvZXMkSGFpci5jb2xvciAlaW4lICJObyBIYWlyIikKYGBgCmBgYHtyfQp0YWJsZShTdXBlcmhlcm9lcyRQdWJsaXNoZXIgJWluJSBjKCJEQyBDb21pY3MiLCAiTWFydmVsIENvbWljcyIpKQpgYGAKCjxiciAvPgpTdWJzZXQgYnkgdmFsdWVzIHdpdGggc3BlY2lmaWMgY2hhcmFjdGVyaXN0aWNzCmBgYHtyfQpTdXBlcmhlcm9lc1tTdXBlcmhlcm9lcyRQdWJsaXNoZXIgJWluJSBjKCJEQyBDb21pY3MiLCAiTWFydmVsIENvbWljcyIpLCBjKCJuYW1lIiwgIkdlbmRlciIsICJQdWJsaXNoZXIiKV0KYGBgCgo8YnIgLz4KRmlsdGVyaW5nIGRhdGEgYW5kIGFkZCB2YXJpYWJsZQpgYGB7cn0KU3VwZXJoZXJvZXMkQk1JIDwtIHJvdW5kKChTdXBlcmhlcm9lcyRXZWlnaHQvKFN1cGVyaGVyb2VzJEhlaWdodCowLjAxKS8oU3VwZXJoZXJvZXMkSGVpZ2h0KjAuMDEpKSwgMykKClNIIDwtIFN1cGVyaGVyb2VzW1N1cGVyaGVyb2VzJEhlaWdodCA+IDAgJiBTdXBlcmhlcm9lcyRIZWlnaHQgPD0gMjUwICYgU3VwZXJoZXJvZXMkV2VpZ2h0ID4gMCAmIFN1cGVyaGVyb2VzJFdlaWdodCA8PSAzMDAsXQpTSFtvcmRlcihTSCRCTUksIFNIJFB1Ymxpc2hlciksXQpgYGAKCjxiciAvPgoKLS0tCgo8YnIgLz4KQ3Jvc3MgdGFicyAxCmBgYHtyfQpTSEJNSSA8LSB4dGFicyhCTUkgfiBHZW5kZXIgKyBBbGlnbm1lbnQsIGRhdGEgPSBTSCkKU0hCTUkKYGBgCgo8YnIgLz4KT3BlbiBhIG5ldyBkYXRhc2V0CmBgYHtyfQpkYXRhKCJVQ0JBZG1pc3Npb25zIikKREYgPSBhcy5kYXRhLmZyYW1lKFVDQkFkbWlzc2lvbnMpCnN1bW1hcnkoREYpCmBgYAoKPGJyIC8+CkNyb3NzIHRhYnMgMgpgYGB7cn0KeHQgPC0geHRhYnMoRnJlcSB+IEdlbmRlciArIEFkbWl0LCBkYXRhPURGKQp4dApgYGAKCjxiciAvPgpBbm90aGVyIG5ldyBkYXRhc2V0IGZvciBleGFtcGxlCmBgYHtyfQpkYXRhKCJ3YXJwYnJlYWtzIikKd2FycGJyZWFrcyRyZXBsaWNhdGUgPC0gcmVwKDE6OSwgbGVuPTU0KQp3YXJwYnJlYWtzCmBgYAoKPGJyIC8+CkNyb3NzIHRhYnMgMwpgYGB7cn0KeHQgPSB4dGFicyhicmVha3Mgfi4sIGRhdGE9d2FycGJyZWFrcykKeHQKYGBgCgo8YnIgLz4KRmxhdCB0YWJsZXMKYGBge3J9CmZ0YWJsZSh4dCkKYGBgCgo8YnIgLz4KU2l6ZSBvZiB0aGUgZGF0YXNldApgYGB7cn0KZmFrZURhdGEgPC0gcm5vcm0oMTBeNikKYGBgCgo8YnIgLz4KYGBge3J9Cm9iamVjdC5zaXplKFN1cGVyaGVyb2VzKQpvYmplY3Quc2l6ZShmYWtlRGF0YSkKYGBgCgo8YnIgLz4KUHJpbnQgZmlsZSBzaXplCmBgYHtyfQpwcmludChvYmplY3Quc2l6ZShmYWtlRGF0YSksIHVuaXRzID0gIk1iIikKYGBgCgo8YnIgLz4KCi0tLQoKPGNlbnRlcj5FTkQ8L2NlbnRlcj4KCi0tLQ==