##Libraries that will be needed/used
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.9.4 :)
## Examples and tutorials at livebook.datascienceheroes.com
## / Now in Spanish: librovivodecienciadedatos.ai
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::src() masks Hmisc::src()
## x dplyr::summarize() masks Hmisc::summarize()
library(Hmisc)
Explanation: These libraries are used for exploratory data analysis.
# Read the bike-buyers data set from a CSV file given by relative path.
library(readr)
mydata <- read_csv(file = "bike_buyers.csv")
## Rows: 1000 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Marital Status, Gender, Education, Occupation, Home Owner, Commute ...
## dbl (5): ID, Income, Children, Cars, Age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Auto-print the tibble to preview its first rows and column types.
mydata
## # A tibble: 1,000 × 13
## ID `Marital Status` Gender Income Children Education Occupation
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 12496 Married Female 40000 1 Bachelors Skilled Ma…
## 2 24107 Married Male 30000 3 Partial College Clerical
## 3 14177 Married Male 80000 5 Partial College Profession…
## 4 24381 Single <NA> 70000 0 Bachelors Profession…
## 5 25597 Single Male 30000 0 Bachelors Clerical
## 6 13507 Married Female 10000 2 Partial College Manual
## 7 27974 Single Male 160000 2 High School Management
## 8 19364 Married Male 40000 1 Bachelors Skilled Ma…
## 9 22155 <NA> Male 20000 2 Partial High School Clerical
## 10 19280 Married Male NA 2 Partial College Manual
## # … with 990 more rows, and 6 more variables: `Home Owner` <chr>, Cars <dbl>,
## # `Commute Distance` <chr>, Region <chr>, Age <dbl>, `Purchased Bike` <chr>
Explanation: read_csv() imports the dataset we were given, and printing mydata shows its first rows.
# Report the number of rows and columns in the data set.
dim(mydata)
## [1] 1000 13
Explanation: The dim() function is used to see the data’s dimensions (number of rows and columns).
# Show the structure of the data set: each column's type and first values.
str(mydata)
## spec_tbl_df [1,000 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ID : num [1:1000] 12496 24107 14177 24381 25597 ...
## $ Marital Status : chr [1:1000] "Married" "Married" "Married" "Single" ...
## $ Gender : chr [1:1000] "Female" "Male" "Male" NA ...
## $ Income : num [1:1000] 40000 30000 80000 70000 30000 10000 160000 40000 20000 NA ...
## $ Children : num [1:1000] 1 3 5 0 0 2 2 1 2 2 ...
## $ Education : chr [1:1000] "Bachelors" "Partial College" "Partial College" "Bachelors" ...
## $ Occupation : chr [1:1000] "Skilled Manual" "Clerical" "Professional" "Professional" ...
## $ Home Owner : chr [1:1000] "Yes" "Yes" "No" "Yes" ...
## $ Cars : num [1:1000] 0 1 2 1 0 0 4 0 2 1 ...
## $ Commute Distance: chr [1:1000] "0-1 Miles" "0-1 Miles" "2-5 Miles" "5-10 Miles" ...
## $ Region : chr [1:1000] "Europe" "Europe" "Europe" "Pacific" ...
## $ Age : num [1:1000] 42 43 60 41 36 50 33 43 58 NA ...
## $ Purchased Bike : chr [1:1000] "No" "No" "No" "Yes" ...
## - attr(*, "spec")=
## .. cols(
## .. ID = col_double(),
## .. `Marital Status` = col_character(),
## .. Gender = col_character(),
## .. Income = col_double(),
## .. Children = col_double(),
## .. Education = col_character(),
## .. Occupation = col_character(),
## .. `Home Owner` = col_character(),
## .. Cars = col_double(),
## .. `Commute Distance` = col_character(),
## .. Region = col_character(),
## .. Age = col_double(),
## .. `Purchased Bike` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Explanation: The str() function shows the structure of the dataset: each variable is listed with its own data type and its first few values.
# Hmisc is already attached earlier in the script, so this call is redundant
# but harmless; it is kept so the chunk can be run on its own.
library(Hmisc)
# Per-variable summary: counts, missing values, distinct values, and either
# quantiles (numeric columns) or frequency tables (categorical columns).
describe(mydata)
## mydata
##
## 13 Variables 1000 Observations
## --------------------------------------------------------------------------------
## ID
## n missing distinct Info Mean Gmd .05 .10
## 1000 0 1000 1 19966 6176 11781 12627
## .25 .50 .75 .90 .95
## 15291 19744 24471 27544 28413
##
## lowest : 11000 11047 11061 11090 11116, highest: 29337 29355 29380 29424 29447
## --------------------------------------------------------------------------------
## Marital Status
## n missing distinct
## 993 7 2
##
## Value Married Single
## Frequency 535 458
## Proportion 0.539 0.461
## --------------------------------------------------------------------------------
## Gender
## n missing distinct
## 989 11 2
##
## Value Female Male
## Frequency 489 500
## Proportion 0.494 0.506
## --------------------------------------------------------------------------------
## Income
## n missing distinct Info Mean Gmd .05 .10
## 994 6 16 0.986 56268 34273 10000 20000
## .25 .50 .75 .90 .95
## 30000 60000 70000 100000 120000
##
## lowest : 10000 20000 30000 40000 50000, highest: 120000 130000 150000 160000 170000
##
## Value 10000 20000 30000 40000 50000 60000 70000 80000 90000
## Frequency 73 74 134 153 40 165 123 90 38
## Proportion 0.073 0.074 0.135 0.154 0.040 0.166 0.124 0.091 0.038
##
## Value 100000 110000 120000 130000 150000 160000 170000
## Frequency 29 16 17 32 4 3 3
## Proportion 0.029 0.016 0.017 0.032 0.004 0.003 0.003
## --------------------------------------------------------------------------------
## Children
## n missing distinct Info Mean Gmd
## 992 8 6 0.96 1.91 1.827
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## Value 0 1 2 3 4 5
## Frequency 274 169 209 133 126 81
## Proportion 0.276 0.170 0.211 0.134 0.127 0.082
## --------------------------------------------------------------------------------
## Education
## n missing distinct
## 1000 0 5
##
## lowest : Bachelors Graduate Degree High School Partial College Partial High School
## highest: Bachelors Graduate Degree High School Partial College Partial High School
##
## Value Bachelors Graduate Degree High School
## Frequency 306 174 179
## Proportion 0.306 0.174 0.179
##
## Value Partial College Partial High School
## Frequency 265 76
## Proportion 0.265 0.076
## --------------------------------------------------------------------------------
## Occupation
## n missing distinct
## 1000 0 5
##
## lowest : Clerical Management Manual Professional Skilled Manual
## highest: Clerical Management Manual Professional Skilled Manual
##
## Value Clerical Management Manual Professional
## Frequency 177 173 119 276
## Proportion 0.177 0.173 0.119 0.276
##
## Value Skilled Manual
## Frequency 255
## Proportion 0.255
## --------------------------------------------------------------------------------
## Home Owner
## n missing distinct
## 996 4 2
##
## Value No Yes
## Frequency 314 682
## Proportion 0.315 0.685
## --------------------------------------------------------------------------------
## Cars
## n missing distinct Info Mean Gmd
## 991 9 5 0.925 1.455 1.226
##
## lowest : 0 1 2 3 4, highest: 0 1 2 3 4
##
## Value 0 1 2 3 4
## Frequency 238 267 342 85 59
## Proportion 0.240 0.269 0.345 0.086 0.060
## --------------------------------------------------------------------------------
## Commute Distance
## n missing distinct
## 1000 0 5
##
## lowest : 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
## highest: 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
##
## Value 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
## Frequency 366 169 111 162 192
## Proportion 0.366 0.169 0.111 0.162 0.192
## --------------------------------------------------------------------------------
## Region
## n missing distinct
## 1000 0 3
##
## Value Europe North America Pacific
## Frequency 300 508 192
## Proportion 0.300 0.508 0.192
## --------------------------------------------------------------------------------
## Age
## n missing distinct Info Mean Gmd .05 .10
## 992 8 53 0.999 44.18 12.85 28.00 30.00
## .25 .50 .75 .90 .95
## 35.00 43.00 52.00 60.90 65.45
##
## lowest : 25 26 27 28 29, highest: 73 74 78 80 89
## --------------------------------------------------------------------------------
## Purchased Bike
## n missing distinct
## 1000 0 2
##
## Value No Yes
## Frequency 519 481
## Proportion 0.519 0.481
## --------------------------------------------------------------------------------
# Data-quality overview per variable: zeros, NAs, infinite values, type, and
# number of unique values.
status(mydata)
## variable q_zeros p_zeros q_na p_na q_inf p_inf
## ID ID 0 0.000 0 0.000 0 0
## Marital Status Marital Status 0 0.000 7 0.007 0 0
## Gender Gender 0 0.000 11 0.011 0 0
## Income Income 0 0.000 6 0.006 0 0
## Children Children 274 0.274 8 0.008 0 0
## Education Education 0 0.000 0 0.000 0 0
## Occupation Occupation 0 0.000 0 0.000 0 0
## Home Owner Home Owner 0 0.000 4 0.004 0 0
## Cars Cars 238 0.238 9 0.009 0 0
## Commute Distance Commute Distance 0 0.000 0 0.000 0 0
## Region Region 0 0.000 0 0.000 0 0
## Age Age 0 0.000 8 0.008 0 0
## Purchased Bike Purchased Bike 0 0.000 0 0.000 0 0
## type unique
## ID numeric 1000
## Marital Status character 2
## Gender character 2
## Income numeric 16
## Children numeric 6
## Education character 5
## Occupation character 5
## Home Owner character 2
## Cars numeric 5
## Commute Distance character 5
## Region character 3
## Age numeric 53
## Purchased Bike character 2
Explanation: The status() function shows the data type of each variable in the dataset, along with its number of unique values and how many zeros, missing values (NA), and infinite values it contains. By using status() I can tell that: - There are 1000 unique values in the ID variable - The variables Children and Cars have the most zeros of all - The Gender variable has 11 missing values.
# Frequency, percentage, and cumulative percentage for each categorical
# (character) variable; numeric columns are not processed.
freq(mydata)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Marital.Status frequency percentage cumulative_perc
## 1 Married 535 53.5 53.5
## 2 Single 458 45.8 99.3
## 3 <NA> 7 0.7 100.0
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Gender frequency percentage cumulative_perc
## 1 Male 500 50.0 50.0
## 2 Female 489 48.9 98.9
## 3 <NA> 11 1.1 100.0
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Education frequency percentage cumulative_perc
## 1 Bachelors 306 30.6 30.6
## 2 Partial College 265 26.5 57.1
## 3 High School 179 17.9 75.0
## 4 Graduate Degree 174 17.4 92.4
## 5 Partial High School 76 7.6 100.0
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Occupation frequency percentage cumulative_perc
## 1 Professional 276 27.6 27.6
## 2 Skilled Manual 255 25.5 53.1
## 3 Clerical 177 17.7 70.8
## 4 Management 173 17.3 88.1
## 5 Manual 119 11.9 100.0
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Home.Owner frequency percentage cumulative_perc
## 1 Yes 682 68.2 68.2
## 2 No 314 31.4 99.6
## 3 <NA> 4 0.4 100.0
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Commute.Distance frequency percentage cumulative_perc
## 1 0-1 Miles 366 36.6 36.6
## 2 5-10 Miles 192 19.2 55.8
## 3 1-2 Miles 169 16.9 72.7
## 4 2-5 Miles 162 16.2 88.9
## 5 10+ Miles 111 11.1 100.0
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Region frequency percentage cumulative_perc
## 1 North America 508 50.8 50.8
## 2 Europe 300 30.0 80.8
## 3 Pacific 192 19.2 100.0
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Purchased.Bike frequency percentage cumulative_perc
## 1 No 519 51.9 51.9
## 2 Yes 481 48.1 100.0
## [1] "Variables processed: Marital.Status, Gender, Education, Occupation, Home.Owner, Commute.Distance, Region, Purchased.Bike"
Explanation: The freq() function shows the frequency and percentage of each category for the categorical variables, printing one frequency table per variable. Some of the things I found in the data are: - Most respondents in Marital Status are Married - The most common gender in the data is Male - The most common education level recorded in the data is Bachelors.
# Plot the numeric variables of the data set.
# NOTE(review): presumably one histogram per numeric column -- confirm against
# the funModeling documentation.
plot_num(mydata)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## qqPlot() Function
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# Quantile-comparison plot of Income; the value it returns holds the row
# indices of the points the plot labels as most extreme.
qqPlot(mydata$Income)
## [1] 13 44
# Flag Income outliers using Tukey's fences (1.5 * IQR beyond the quartiles).
# The previous fixed cutoff (Income > 100) matched every non-missing income,
# because the smallest recorded income is 10000, so it could not single out
# extreme values.
# NOTE: echoed output generated with the old cutoff must be refreshed by
# re-running this chunk.
incomeQuartiles <- quantile(mydata$Income, probs = c(0.25, 0.75), na.rm = TRUE)
incomeFence <- 1.5 * IQR(mydata$Income, na.rm = TRUE)
# which() drops NA comparisons, so rows with a missing Income are never flagged.
outlierIndex <- which(
  mydata$Income < incomeQuartiles[[1]] - incomeFence |
    mydata$Income > incomeQuartiles[[2]] + incomeFence
)
# Row labels of the flagged observations.
rownames(mydata)[outlierIndex]
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "11"
## [11] "12" "13" "14" "15" "16" "17" "18" "19" "20" "21"
## [21] "22" "23" "24" "25" "26" "27" "28" "29" "30" "31"
## [31] "32" "33" "34" "35" "36" "37" "38" "39" "40" "41"
## [41] "42" "43" "44" "45" "46" "47" "48" "49" "50" "51"
## [51] "52" "53" "54" "55" "56" "57" "58" "59" "60" "61"
## [61] "62" "63" "64" "65" "66" "67" "68" "69" "70" "71"
## [71] "72" "73" "74" "75" "76" "77" "78" "79" "80" "81"
## [81] "82" "83" "84" "85" "86" "87" "88" "89" "90" "91"
## [91] "92" "93" "94" "95" "96" "97" "98" "99" "100" "101"
## [101] "102" "103" "104" "105" "106" "107" "108" "109" "110" "112"
## [111] "113" "114" "115" "116" "117" "118" "119" "120" "121" "122"
## [121] "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [131] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142"
## [141] "143" "144" "145" "146" "147" "148" "149" "150" "151" "152"
## [151] "153" "154" "155" "156" "157" "158" "159" "160" "161" "162"
## [161] "163" "164" "165" "166" "167" "168" "169" "170" "171" "172"
## [171] "173" "174" "175" "176" "177" "178" "179" "180" "181" "182"
## [181] "183" "184" "185" "186" "187" "188" "189" "190" "191" "193"
## [191] "194" "195" "196" "197" "198" "199" "200" "201" "202" "203"
## [201] "204" "205" "206" "207" "208" "209" "210" "211" "212" "213"
## [211] "214" "215" "216" "217" "218" "219" "220" "221" "222" "223"
## [221] "224" "225" "226" "227" "228" "229" "230" "231" "232" "233"
## [231] "234" "235" "236" "237" "238" "239" "240" "241" "242" "243"
## [241] "244" "245" "246" "247" "248" "249" "250" "251" "252" "253"
## [251] "254" "255" "256" "257" "258" "259" "260" "261" "262" "263"
## [261] "264" "265" "266" "267" "268" "269" "270" "271" "272" "273"
## [271] "274" "275" "276" "277" "278" "279" "280" "281" "282" "283"
## [281] "284" "285" "286" "287" "288" "289" "290" "291" "292" "293"
## [291] "294" "295" "296" "297" "298" "299" "300" "301" "303" "304"
## [301] "305" "306" "307" "308" "309" "310" "311" "312" "313" "314"
## [311] "315" "316" "317" "318" "319" "320" "321" "322" "323" "324"
## [321] "325" "326" "327" "328" "329" "330" "331" "332" "333" "334"
## [331] "335" "336" "337" "338" "339" "340" "341" "342" "343" "344"
## [341] "345" "346" "347" "348" "349" "350" "351" "352" "353" "354"
## [351] "355" "356" "357" "358" "359" "360" "361" "362" "363" "364"
## [361] "365" "366" "367" "368" "369" "370" "371" "372" "373" "374"
## [371] "375" "376" "377" "378" "379" "380" "381" "382" "383" "384"
## [381] "385" "386" "387" "388" "389" "390" "391" "392" "393" "394"
## [391] "395" "396" "397" "398" "399" "400" "401" "402" "403" "404"
## [401] "405" "406" "407" "408" "409" "410" "411" "412" "413" "414"
## [411] "415" "416" "417" "418" "419" "420" "421" "422" "423" "424"
## [421] "425" "426" "427" "428" "429" "430" "431" "432" "433" "434"
## [431] "435" "436" "437" "438" "439" "440" "441" "443" "444" "445"
## [441] "446" "447" "448" "449" "450" "451" "452" "453" "454" "455"
## [451] "456" "457" "458" "459" "460" "461" "462" "463" "464" "465"
## [461] "466" "467" "468" "469" "470" "471" "472" "473" "474" "475"
## [471] "476" "477" "478" "479" "480" "481" "482" "483" "484" "485"
## [481] "486" "487" "488" "489" "490" "491" "492" "493" "494" "495"
## [491] "496" "497" "498" "499" "500" "501" "502" "503" "504" "505"
## [501] "506" "507" "508" "509" "511" "512" "513" "514" "515" "516"
## [511] "517" "518" "519" "520" "521" "522" "523" "524" "525" "526"
## [521] "527" "528" "529" "530" "531" "532" "533" "534" "535" "536"
## [531] "537" "538" "539" "540" "541" "542" "543" "544" "545" "546"
## [541] "547" "548" "549" "550" "551" "552" "553" "554" "555" "556"
## [551] "557" "558" "559" "560" "561" "562" "563" "564" "565" "566"
## [561] "567" "568" "569" "570" "571" "572" "573" "574" "575" "576"
## [571] "577" "578" "579" "580" "581" "582" "583" "584" "585" "586"
## [581] "587" "588" "589" "590" "591" "592" "593" "594" "595" "596"
## [591] "597" "598" "599" "600" "601" "602" "603" "604" "605" "606"
## [601] "607" "608" "609" "610" "611" "612" "613" "614" "615" "616"
## [611] "617" "618" "619" "620" "621" "622" "623" "624" "625" "626"
## [621] "627" "628" "629" "630" "631" "632" "633" "634" "635" "636"
## [631] "637" "638" "639" "640" "641" "642" "643" "644" "645" "646"
## [641] "647" "648" "649" "650" "651" "652" "653" "654" "655" "656"
## [651] "657" "658" "659" "660" "661" "662" "663" "664" "665" "666"
## [661] "667" "668" "669" "670" "671" "672" "673" "674" "675" "676"
## [671] "677" "678" "679" "680" "681" "682" "683" "684" "685" "686"
## [681] "687" "688" "689" "690" "691" "692" "693" "694" "695" "696"
## [691] "697" "698" "699" "700" "701" "702" "703" "704" "705" "706"
## [701] "707" "708" "709" "710" "711" "712" "713" "714" "715" "716"
## [711] "717" "718" "719" "720" "721" "722" "723" "724" "725" "726"
## [721] "727" "728" "729" "730" "731" "732" "733" "734" "735" "736"
## [731] "737" "738" "739" "740" "741" "742" "743" "744" "745" "746"
## [741] "747" "748" "749" "750" "751" "752" "753" "754" "755" "756"
## [751] "757" "758" "759" "760" "761" "762" "763" "764" "765" "766"
## [761] "767" "768" "769" "770" "771" "772" "773" "774" "775" "776"
## [771] "777" "778" "779" "780" "781" "782" "783" "784" "785" "786"
## [781] "787" "788" "789" "790" "791" "792" "793" "794" "795" "796"
## [791] "797" "798" "799" "800" "801" "802" "803" "804" "805" "806"
## [801] "807" "808" "809" "810" "811" "812" "813" "814" "815" "816"
## [811] "817" "818" "819" "820" "821" "822" "823" "824" "825" "826"
## [821] "827" "828" "829" "830" "831" "832" "833" "834" "835" "836"
## [831] "837" "838" "839" "840" "841" "842" "843" "844" "845" "846"
## [841] "847" "848" "849" "850" "851" "852" "853" "854" "855" "856"
## [851] "857" "858" "859" "860" "861" "862" "863" "864" "865" "866"
## [861] "867" "868" "869" "870" "871" "872" "873" "874" "875" "876"
## [871] "877" "878" "879" "880" "881" "882" "883" "884" "885" "886"
## [881] "887" "888" "889" "890" "891" "892" "893" "894" "895" "896"
## [891] "897" "898" "899" "900" "901" "902" "903" "904" "905" "906"
## [901] "907" "908" "909" "910" "911" "912" "913" "914" "915" "916"
## [911] "917" "918" "919" "920" "921" "922" "923" "924" "925" "926"
## [921] "927" "928" "929" "930" "931" "932" "933" "934" "935" "936"
## [931] "937" "938" "939" "940" "941" "942" "943" "944" "945" "946"
## [941] "947" "948" "949" "950" "951" "952" "953" "954" "955" "956"
## [951] "957" "958" "959" "960" "961" "962" "963" "964" "965" "966"
## [961] "967" "968" "969" "970" "971" "972" "973" "974" "975" "976"
## [971] "977" "978" "979" "980" "981" "982" "983" "984" "985" "986"
## [981] "987" "988" "989" "990" "991" "992" "993" "994" "995" "996"
## [991] "997" "998" "999" "1000"