# Lesson 2
# Make the lesson's data directory the working directory so the relative
# file names used below resolve.
# NOTE(review): this is an absolute, machine-specific path — adjust it (or
# start R in the data directory) before running on another machine.
setwd("D:/R/Udacity/L2/EDA_Course_Materials/lesson2")
# Read the per-state data set into a data frame.
statesInfo <- read.csv('stateData.csv')
# Filter with subset(): print the rows whose state.region equals 1.
# The condition is evaluated inside the data frame, so the column can be
# named without the `statesInfo$` prefix.
subset(statesInfo, state.region == 1)
## X state.abb state.area state.region population income
## 7 Connecticut CT 5009 1 3100 5348
## 19 Maine ME 33215 1 1058 3694
## 21 Massachusetts MA 8257 1 5814 4755
## 29 New Hampshire NH 9304 1 812 4281
## 30 New Jersey NJ 7836 1 7333 5237
## 32 New York NY 49576 1 18076 4903
## 38 Pennsylvania PA 45333 1 11860 4449
## 39 Rhode Island RI 1214 1 931 4558
## 45 Vermont VT 9609 1 472 3907
## illiteracy life.exp murder highSchoolGrad frost area
## 7 1.1 72.48 3.1 56.0 139 4862
## 19 0.7 70.39 2.7 54.7 161 30920
## 21 1.1 71.83 3.3 58.5 103 7826
## 29 0.7 71.23 3.3 57.6 174 9027
## 30 1.1 70.93 5.2 52.5 115 7521
## 32 1.4 70.55 10.9 52.7 82 47831
## 38 1.0 70.43 6.1 50.2 126 44966
## 39 1.3 71.90 2.4 46.4 127 1049
## 45 0.6 71.64 5.5 57.1 168 9267
# Same filter written with bracket indexing: the logical vector selects the
# rows where state.region equals 1, and the empty column index keeps every
# column. The result is stored instead of printed.
stateSubsetBracket <- statesInfo[statesInfo$state.region == 1, ]
# Preview the first rows of the stored subset.
head(stateSubsetBracket)
## X state.abb state.area state.region population income
## 7 Connecticut CT 5009 1 3100 5348
## 19 Maine ME 33215 1 1058 3694
## 21 Massachusetts MA 8257 1 5814 4755
## 29 New Hampshire NH 9304 1 812 4281
## 30 New Jersey NJ 7836 1 7333 5237
## 32 New York NY 49576 1 18076 4903
## illiteracy life.exp murder highSchoolGrad frost area
## 7 1.1 72.48 3.1 56.0 139 4862
## 19 0.7 70.39 2.7 54.7 161 30920
## 21 1.1 71.83 3.3 58.5 103 7826
## 29 0.7 71.23 3.3 57.6 174 9027
## 30 1.1 70.93 5.2 52.5 115 7521
## 32 1.4 70.55 10.9 52.7 82 47831
# Report the subset's dimensions as (rows, columns).
dim(stateSubsetBracket)
## [1] 9 12
# Load the reddit survey responses into a data frame.
# stringsAsFactors = TRUE is passed explicitly: the later steps
# (summary(reddit), levels(reddit$age.range)) expect the text columns to be
# factors. That was read.csv()'s default before R 4.0.0; on newer versions
# the columns would otherwise be plain character vectors and levels() would
# return NULL.
reddit <- read.csv("reddit.csv", stringsAsFactors = TRUE)
# Count respondents per employment status. Missing values are not tabulated
# by table()'s default settings.
table(reddit$employment.status)
##
## Employed full time
## 14814
## Freelance
## 1948
## Not employed and not looking for work
## 682
## Not employed, but looking for work
## 2087
## Retired
## 85
## Student
## 12987
# Print a per-column summary of the survey data: quantiles and mean for
# numeric columns, value counts for factor columns, plus NA counts.
summary(reddit)
## id gender age.range
## Min. : 1 Min. :0.0000 18-24 :15802
## 1st Qu.: 8189 1st Qu.:0.0000 25-34 :11575
## Median :16380 Median :0.0000 Under 18: 2330
## Mean :16379 Mean :0.1885 35-44 : 2257
## 3rd Qu.:24568 3rd Qu.:0.0000 45-54 : 502
## Max. :32756 Max. :1.0000 (Other) : 200
## NA's :201 NA's : 88
## marital.status
## Engaged : 1109
## Forever Alone : 5850
## In a relationship : 9828
## Married/civil union/domestic partnership: 5490
## Single :10428
## Widowed : 44
## NA's : 5
## employment.status military.service
## Employed full time :14814 No :30526
## Freelance : 1948 Yes : 2223
## Not employed and not looking for work: 682 NA's: 5
## Not employed, but looking for work : 2087
## Retired : 85
## Student :12987
## NA's : 151
## children education
## No :27488 Bachelor's degree :11046
## Yes : 5047 Some college : 9600
## NA's: 219 Graduate or professional degree : 4722
## High school graduate or equivalent: 3272
## Some high school : 1924
## (Other) : 2046
## NA's : 144
## country state income.range
## United States :20967 :11908 Under $20,000 :7892
## Canada : 2888 California: 3401 $50,000 - $69,999 :4133
## United Kingdom: 1782 Texas : 1541 $70,000 - $99,999 :4101
## Australia : 1051 New York : 1418 $100,000 - $149,999:3522
## Germany : 407 Illinois : 976 $20,000 - $29,999 :3206
## (Other) : 5482 Washington: 910 (Other) :8285
## NA's : 177 (Other) :12600 NA's :1615
## fav.reddit dog.cat cheese
## : 4335 I like cats. :11156 Other :6563
## askreddit : 2123 I like dogs. :17151 Cheddar :6102
## fffffffuuuuuuuuuuuu: 1746 I like turtles.: 4442 Brie :3742
## pics : 1651 NA's : 5 Provolone:3456
## trees : 1311 Swiss :3214
## (Other) :21562 (Other) :9672
## NA's : 26 NA's : 5
# List the levels of age.range in their current order. "Under 18" comes last
# here, which is why the exercise below asks for the levels to be reordered.
levels(reddit$age.range)
## [1] "18-24" "25-34" "35-44" "45-54" "55-64"
## [6] "65 or Above" "Under 18"
# Load ggplot2, which provides qplot().
library(ggplot2)
# Quick plot of the age.range column of the reddit data frame, drawn before
# the levels are reordered.
qplot(data = reddit, x = age.range)

# Order the factor levels in the age.range variable in order to create
# a graph with a natural order. Look up the documentation for
# the factor function or read through the example in the Instructor Notes.
# Once you're ready, try to write the code to order the levels of
# the age.range variable.
# Be sure you modify the variable in the data frame. That is, modify reddit$age.range.
# Don't create a new variable.
# The levels of age.range should take on these values...
# "Under 18", "18-24", "25-34", "35-44", "45-54", "55-64", "65 or Above"
# This exercise is ungraded. You can check your own work by using the Test Run
# button. Your plot will appear there.
# Exercise solution: reorder the levels of reddit$age.range from youngest to
# oldest so the plot's x axis follows the natural age order.
library(ggplot2)
# Convert the column in place to an ordered factor with an explicit level
# order. Each label must match an existing value exactly: ordered() silently
# turns any value that is not listed in `levels` into NA, so the last level
# has to be spelled "65 or Above", the value actually present in the data.
reddit$age.range <- ordered(
  reddit$age.range,
  levels = c(
    "Under 18", "18-24", "25-34", "35-44", "45-54", "55-64", "65 or Above"
  )
)
# Redraw the plot of age.range using the reordered levels.
qplot(data = reddit, x = age.range)
