library(memisc)
## Loading required package: lattice
## Loading required package: MASS
##
## Attaching package: 'memisc'
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
##
## as.array
# Wrap the built-in Orange data frame in a memisc data set. The summary
# printed below lists its columns as Orange.Tree, Orange.age and
# Orange.circumference.
df <- data.set(Orange)
summary(df)
## Orange.Tree Orange.age Orange.circumference
## 3:7 Min. : 118.0 Min. : 30.0
## 1:7 1st Qu.: 484.0 1st Qu.: 65.5
## 5:7 Median :1004.0 Median :115.0
## 2:7 Mean : 922.1 Mean :115.9
## 4:7 3rd Qu.:1372.0 3rd Qu.:161.5
## Max. :1582.0 Max. :214.0
# Histogram of the circumference values.
hist(df$Orange.circumference, col = "green")
library(datasets)
# Boxplot of the same circumference values.
boxplot(df$Orange.circumference,
  ylab = "Circumference of Orange Tree",
  xlab = "Orange Tree",
  main = "Boxplot for dataset Orange"
)
# Scatter plot of circumference against age.
plot(df$Orange.age, df$Orange.circumference,
  xlab = "Orange Tree Age", ylab = "Orange Tree Circumference"
)
# Overlay the least-squares line. The model is fitted on the Orange data frame
# by column name so that `data` is actually used: the previous formula named
# df$... vectors directly, which left `data = Orange` without any effect.
# `df` was built from Orange, so the fitted line describes the plotted points.
abline(lm(circumference ~ age, data = Orange), col = "blue")
## Note: method with signature 'numeric.item#numeric' chosen for function '-',
## target signature 'double.item#double.item'.
## "numeric#numeric.item" would also be valid
As the output above shows, exploratory data analysis (EDA) indicates that the “Orange” dataset contains no anomalies and no missing values. The scatter plot also reveals a clear pattern: the older an orange tree is, the larger its circumference.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x purrr::%@%() masks memisc::%@%()
## x dplyr::collect() masks memisc::collect()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::recode() masks memisc::recode()
## x dplyr::rename() masks memisc::rename()
## x dplyr::select() masks MASS::select()
## x dplyr::syms() masks ggplot2::syms(), memisc::syms()
## x tibble::view() masks memisc::view()
# List the attributes stored on the circumference column of `df`; the output
# below includes its measurement level and its memisc item class.
attributes(df$Orange.circumference)
## $value.labels
## `\001NULL\001`
##
## $value.filter
## `\001NULL\001`
##
## $measurement
## [1] "interval"
##
## $annotation
##
## $class
## [1] "double.item"
## attr(,"package")
## [1] "memisc"
# Build a codebook with one row per variable of `df`, holding the measurement
# level stored in each column's attributes. pivot_longer() replaces the
# superseded gather() and yields the same two character columns.
codebook <- map_df(df, function(x) attributes(x)$measurement) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Orange_Tree_Data",
    values_to = "Type_of_Measurement"
  )
view(codebook)
# Add each variable's storage type and its mean (NAs ignored). TRUE is spelled
# out because T is an ordinary variable that can be reassigned.
codebook <- codebook %>%
  mutate(
    Type = map_chr(df, typeof),
    Mean = map_dbl(df, mean, na.rm = TRUE)
  )
codebook
## # A tibble: 3 x 4
## Orange_Tree_Data Type_of_Measurement Type Mean
## <chr> <chr> <chr> <dbl>
## 1 Orange.Tree ordinal integer 3
## 2 Orange.age interval double 922.
## 3 Orange.circumference interval double 116.
library(dplyr)
# Read the games table from xboxgames.csv in the working directory.
mydata <- read.csv("xboxgames.csv")
# Keep only the rows whose publisher is exactly "Xbox Game Studios".
mydata1 <- mydata %>%
  filter(publisher == "Xbox Game Studios")
mydata1
## gameid name publisher developer
## 1 5907 Quantum Break Xbox Game Studios Remedy Entertainment
## genre
## 1 Third Person Shooter
The filter() function keeps only the rows (observations) that satisfy the conditions the user specifies. In mydata1, filter() returns only the games whose publisher is Xbox Game Studios and drops all other games.
### arrange()
# Sort the games by gameid, smallest first (arrange() sorts ascending unless
# desc() is used).
mydata2 <- mydata %>%
  arrange(gameid)
mydata2
## gameid name
## 1 -5215 QUByte Classics - Brave Battle Saga: The Legend of the Magic Warrior
## 2 -5214 QUByte Classics - Canon: Legends of the New Gods
## 3 -5213 QUByte Classics - The Humans
## 4 -4055 Quantum League
## 5 5520 Quantum Rush: Champions
## 6 5530 Q
## 7 5561 Q.U.B.E. Director's Cut
## 8 5819 Q*bert REBOOTED: The XBOX One @!#?@! Edition
## 9 5907 Quantum Break
## 10 6052 Quatros Origins
## 11 6896 Quantic Pinball
## 12 6951 Q.U.B.E. 2
## 13 7600 Queen's Quest 2: Stories of Forgotten Past
## 14 7722 Qubit's Quest
## 15 7775 Queen's Quest 3: The End of Dawn
## 16 8053 Queen's Quest 4: Sacred Truce
## 17 9278 Quantum Replica
## 18 9781 Quake
## 19 10031 QUByte Classics - The Immortal by PIKO
## 20 10200 Queen's Quest 5: Symphony of Death
## publisher developer
## 1
## 2
## 3
## 4
## 5 GameArt Studio GameArt Studio
## 6 liica liica, OrangeBox
## 7 GRIP Digital Toxic Games, GRIP Digital
## 8 GPC LOOT Interactive
## 9 Xbox Game Studios Remedy Entertainment
## 10 God As A Cucumber God As A Cucumber
## 11 Plug In Digital Shine Research
## 12 Trapped Nerve Games Toxic Games
## 13 Artifex Mundi Brave Giant
## 14 Performance Designed Products Two Okes Entertainment
## 15 Artifex Mundi Brave Giant
## 16 Artifex Mundi Brave Giant
## 17 PQube ON3D Studios
## 18 Bethesda Softworks MachineGames, Nightdive Studios, id Software
## 19 QUByte Interactive Piko Interactive
## 20 Artifex Mundi Brave Giant
## genre
## 1
## 2
## 3
## 4
## 5 Arcade Racing
## 6 Puzzle
## 7 Puzzle, Platformer
## 8 Action
## 9 Third Person Shooter
## 10 Puzzle
## 11 Pinball
## 12 Puzzle, Platformer
## 13 Adventure, Point & Click
## 14 Platformer
## 15 Adventure, Point & Click
## 16 Adventure, Point & Click
## 17 Stealth
## 18 First Person Shooter
## 19 Action-Adventure
## 20 Adventure, Point & Click
The arrange() function sorts the rows of a data frame by one or more variables. In mydata2, arrange() orders the games by their gameid in ascending order.
# Append a NewGameID column holding each row's gameid shifted up by 5000.
mydata3 <- mydata %>%
  mutate(NewGameID = gameid + 5000)
mydata3
## gameid name
## 1 5530 Q
## 2 5819 Q*bert REBOOTED: The XBOX One @!#?@! Edition
## 3 6951 Q.U.B.E. 2
## 4 5561 Q.U.B.E. Director's Cut
## 5 9781 Quake
## 6 6896 Quantic Pinball
## 7 5907 Quantum Break
## 8 -4055 Quantum League
## 9 9278 Quantum Replica
## 10 5520 Quantum Rush: Champions
## 11 6052 Quatros Origins
## 12 7722 Qubit's Quest
## 13 -5215 QUByte Classics - Brave Battle Saga: The Legend of the Magic Warrior
## 14 -5214 QUByte Classics - Canon: Legends of the New Gods
## 15 -5213 QUByte Classics - The Humans
## 16 10031 QUByte Classics - The Immortal by PIKO
## 17 7600 Queen's Quest 2: Stories of Forgotten Past
## 18 7775 Queen's Quest 3: The End of Dawn
## 19 8053 Queen's Quest 4: Sacred Truce
## 20 10200 Queen's Quest 5: Symphony of Death
## publisher developer
## 1 liica liica, OrangeBox
## 2 GPC LOOT Interactive
## 3 Trapped Nerve Games Toxic Games
## 4 GRIP Digital Toxic Games, GRIP Digital
## 5 Bethesda Softworks MachineGames, Nightdive Studios, id Software
## 6 Plug In Digital Shine Research
## 7 Xbox Game Studios Remedy Entertainment
## 8
## 9 PQube ON3D Studios
## 10 GameArt Studio GameArt Studio
## 11 God As A Cucumber God As A Cucumber
## 12 Performance Designed Products Two Okes Entertainment
## 13
## 14
## 15
## 16 QUByte Interactive Piko Interactive
## 17 Artifex Mundi Brave Giant
## 18 Artifex Mundi Brave Giant
## 19 Artifex Mundi Brave Giant
## 20 Artifex Mundi Brave Giant
## genre NewGameID
## 1 Puzzle 10530
## 2 Action 10819
## 3 Puzzle, Platformer 11951
## 4 Puzzle, Platformer 10561
## 5 First Person Shooter 14781
## 6 Pinball 11896
## 7 Third Person Shooter 10907
## 8 945
## 9 Stealth 14278
## 10 Arcade Racing 10520
## 11 Puzzle 11052
## 12 Platformer 12722
## 13 -215
## 14 -214
## 15 -213
## 16 Action-Adventure 15031
## 17 Adventure, Point & Click 12600
## 18 Adventure, Point & Click 12775
## 19 Adventure, Point & Click 13053
## 20 Adventure, Point & Click 15200
The mutate() function creates new variables, or modifies existing ones, using expressions the user supplies. In mydata3, mutate() adds a new variable named NewGameID whose value is each game's gameid plus 5000.
# Keep the columns from gameid through publisher, plus genre. This relies on
# dplyr's select(), which the tidyverse conflict listing shows masking
# MASS::select().
mydata4 <- mydata %>%
  select(gameid:publisher, genre)
mydata4
## gameid name
## 1 5530 Q
## 2 5819 Q*bert REBOOTED: The XBOX One @!#?@! Edition
## 3 6951 Q.U.B.E. 2
## 4 5561 Q.U.B.E. Director's Cut
## 5 9781 Quake
## 6 6896 Quantic Pinball
## 7 5907 Quantum Break
## 8 -4055 Quantum League
## 9 9278 Quantum Replica
## 10 5520 Quantum Rush: Champions
## 11 6052 Quatros Origins
## 12 7722 Qubit's Quest
## 13 -5215 QUByte Classics - Brave Battle Saga: The Legend of the Magic Warrior
## 14 -5214 QUByte Classics - Canon: Legends of the New Gods
## 15 -5213 QUByte Classics - The Humans
## 16 10031 QUByte Classics - The Immortal by PIKO
## 17 7600 Queen's Quest 2: Stories of Forgotten Past
## 18 7775 Queen's Quest 3: The End of Dawn
## 19 8053 Queen's Quest 4: Sacred Truce
## 20 10200 Queen's Quest 5: Symphony of Death
## publisher genre
## 1 liica Puzzle
## 2 GPC Action
## 3 Trapped Nerve Games Puzzle, Platformer
## 4 GRIP Digital Puzzle, Platformer
## 5 Bethesda Softworks First Person Shooter
## 6 Plug In Digital Pinball
## 7 Xbox Game Studios Third Person Shooter
## 8
## 9 PQube Stealth
## 10 GameArt Studio Arcade Racing
## 11 God As A Cucumber Puzzle
## 12 Performance Designed Products Platformer
## 13
## 14
## 15
## 16 QUByte Interactive Action-Adventure
## 17 Artifex Mundi Adventure, Point & Click
## 18 Artifex Mundi Adventure, Point & Click
## 19 Artifex Mundi Adventure, Point & Click
## 20 Artifex Mundi Adventure, Point & Click
The select() function chooses which variables (columns) to keep. In mydata4, select() keeps the columns from “gameid” through “publisher”, plus the column “genre”.
# For name, publisher and developer, count the rows and the NA values.
# across() replaces the superseded summarise_at()/vars() pair; the two calls
# with explicit .names keep the original column names and order
# (<col>_n first, then <col>_missing).
# NOTE(review): the tables printed earlier show blank publisher/developer
# cells. These appear to be empty strings rather than NA, so is.na() does not
# count them. If blanks should count as missing, read the file with
# na.strings = c("", "NA") -- confirm against the CSV.
mydata5 <- mydata %>%
  summarise(
    across(c(name, publisher, developer), ~ n(), .names = "{.col}_n"),
    across(
      c(name, publisher, developer),
      ~ sum(is.na(.x)),
      .names = "{.col}_missing"
    )
  )
mydata5
## name_n publisher_n developer_n name_missing publisher_missing
## 1 20 20 20 0 0
## developer_missing
## 1 0
The summarise() function reduces a data frame to summary values. In mydata5, it computes the number of records and the number of missing (NA) values for each of the specified variables: “name”, “publisher” and “developer”.