#Importing
agland <- read.csv("https://raw.githubusercontent.com/davidblumenstiel/data/master/Agricultural%20Land%20Values.csv")
head(agland)
## Region.and.State X2015 X2016 X2017 X2018 X2019 Change.2018.2019
## 1 Northeast ..........: 5,190 5,270 5,380 5,550 5,690 2.5
## 2 Connecticut ......: 11,700 11,900 12,100 12,300 12,200 -0.8
## 3 Delaware .........: 8,110 8,290 8,250 8,410 8,950 6.4
## 4 Maine ............: 2,170 2,210 2,370 2,370 2,410 1.7
## 5 Maryland .........: 7,270 7,470 7,620 7,860 8,060 2.5
## 6 Massachusetts ....: 10,600 10,700 10,800 10,900 11,100 1.8
#Renaming columns
names(agland)[1:7] <- c("Region/State","2015","2016","2017","2018","2019","%Change2018-2019")
#Getting rid of blank rows
agland <- na.omit(agland)
#Reformatting columns to their appropriate types
#Need to remove the '.....'s from Column 1
agland[,1] <- gsub("\\.*:","",as.character(agland[,1]))
#Changing the value by year columns to numeric; also removing the ','s
agland[,2] <- as.numeric(gsub(",","",as.character(agland[,2])))
agland[,3] <- as.numeric(gsub(",","",as.character(agland[,3])))
agland[,4] <- as.numeric(gsub(",","",as.character(agland[,4])))
agland[,5] <- as.numeric(gsub(",","",as.character(agland[,5])))
agland[,6] <- as.numeric(gsub(",","",as.character(agland[,6])))
head(agland)
## Region/State 2015 2016 2017 2018 2019 %Change2018-2019
## 1 Northeast 5190 5270 5380 5550 5690 2.5
## 2 Connecticut 11700 11900 12100 12300 12200 -0.8
## 3 Delaware 8110 8290 8250 8410 8950 6.4
## 4 Maine 2170 2210 2370 2370 2410 1.7
## 5 Maryland 7270 7470 7620 7860 8060 2.5
## 6 Massachusetts 10600 10700 10800 10900 11100 1.8
The year columns would make an easy target for tidying.
agland <- gather(agland,"Year","Value",2:6)
head(agland)
## Region/State %Change2018-2019 Year Value
## 1 Northeast 2.5 2015 5190
## 2 Connecticut -0.8 2015 11700
## 3 Delaware 6.4 2015 8110
## 4 Maine 1.7 2015 2170
## 5 Maryland 2.5 2015 7270
## 6 Massachusetts 1.8 2015 10600
#Easy to do with dplyr
medAgland <- agland %>%
group_by(`Region/State`) %>%
summarize(Median_Value = median(Value))
head(medAgland)
## # A tibble: 6 x 2
## `Region/State` Median_Value
## <chr> <dbl>
## 1 "Appalachian " 3970
## 2 "Connecticut " 12100
## 3 "Corn Belt " 6100
## 4 "Delaware " 8290
## 5 "Illinois " 7280
## 6 "Indiana " 6580
#Now what we have the median values, we can use ggplot2 to make a nice plot
ggplot(medAgland, aes(x = reorder(`Region/State`,Median_Value), y = Median_Value, fill = `Region/State`)) +
coord_flip() +
geom_col() +
ggtitle("Median Land Value Across All Years")+
xlab("Median Value") +
ylab("State/Region") +
theme(legend.position = "none")
Rhode Island has the highest median land value ($14900), and North Dakota has the lowest (1740)
#Good time to use box plots
ggplot(agland,aes(x = Year, y = Value, fill = Year)) +
geom_boxplot() +
scale_fill_brewer(palette=1)+
ggtitle("Land Value by Year")
Each of the years are pretty similar in terms of land value, with a slight increase per year. One interesting observation is that the IQR is tightening as time goes on.
#Importing
stuper <- read.csv("https://gist.githubusercontent.com/Kimmirikwa/b69d0ea134820ea52f8481991ffae93e/raw/4db7b1698035ee29885d10e1a59bd902716ae168/student_results.csv")
stuper
## id name phone sex.and.age test.number term.1 term.2 term.3
## 1 1 Mike 134 m_12 test 1 76 84 87
## 2 2 Linda 270 f_13 test 1 88 90 73
## 3 3 Sam 210 m_11 test 1 78 74 80
## 4 4 Esther 617 f_12 test 1 68 75 74
## 5 5 Mary 114 f_14 test 1 65 67 64
## 6 1 Mike 134 m_12 test 2 85 80 90
## 7 2 Linda 270 f_13 test 2 87 82 94
## 8 3 Sam 210 m_11 test 2 80 87 80
## 9 4 Esther 617 f_12 test 2 70 75 78
## 10 5 Mary 114 f_14 test 2 68 70 63
This dataset arrives much cleaner than the last one, however the term columns could be combined, and the sex and age column can be seperated into 2
#Spliting the sex.and.age column with dplyr: separate
stuper <- stuper %>%
separate(sex.and.age,c("sex","age"),"_")
stuper
## id name phone sex age test.number term.1 term.2 term.3
## 1 1 Mike 134 m 12 test 1 76 84 87
## 2 2 Linda 270 f 13 test 1 88 90 73
## 3 3 Sam 210 m 11 test 1 78 74 80
## 4 4 Esther 617 f 12 test 1 68 75 74
## 5 5 Mary 114 f 14 test 1 65 67 64
## 6 1 Mike 134 m 12 test 2 85 80 90
## 7 2 Linda 270 f 13 test 2 87 82 94
## 8 3 Sam 210 m 11 test 2 80 87 80
## 9 4 Esther 617 f 12 test 2 70 75 78
## 10 5 Mary 114 f 14 test 2 68 70 63
#Combining the term columns with tidyr: gather
stuper <- gather(stuper,"term","score",7:9)
head(stuper)
## id name phone sex age test.number term score
## 1 1 Mike 134 m 12 test 1 term.1 76
## 2 2 Linda 270 f 13 test 1 term.1 88
## 3 3 Sam 210 m 11 test 1 term.1 78
## 4 4 Esther 617 f 12 test 1 term.1 68
## 5 5 Mary 114 f 14 test 1 term.1 65
## 6 1 Mike 134 m 12 test 2 term.1 85
#Can make a new output dataframe for clarification:
#One row per student per test
stuimp <- data.frame(matrix(data = NA,nrow = 10,ncol=3))
names(stuimp) <- c("Student","Test Number","Ratio of Improvment Between Terms 1 and 3")
#Takes student names from the tidy dataset
stuimp$Student <- as.character(stuper$name[1:10])
#Makes a vector of the percent change ia student's score for test 1, and appends it to it's new column int he output dataframe
testImprove <- (stuper$score[stuper$term == "term.3" & stuper$test.number == "test 1"] -
stuper$score[stuper$term == "term.1" & stuper$test.number == "test 1"]) /
stuper$score[stuper$term == "term.3" & stuper$test.number == "test 1"]
#Does the same thing for test 2 and adds it to the vector above
testImprove <- c(testImprove,(stuper$score[stuper$term == "term.3" & stuper$test.number == "test 2"] -
stuper$score[stuper$term == "term.1" & stuper$test.number == "test 2"]) /
stuper$score[stuper$term == "term.3" & stuper$test.number == "test 2"])
#Appends testImprove to the output dataframe
stuimp$`Ratio of Improvment Between Terms 1 and 3` <- testImprove
#Adds the test number
stuimp$`Test Number` <- c(rep(1,5),rep(2,5))
stuimp
## Student Test Number Ratio of Improvment Between Terms 1 and 3
## 1 Mike 1 0.12643678
## 2 Linda 1 -0.20547945
## 3 Sam 1 0.02500000
## 4 Esther 1 0.08108108
## 5 Mary 1 -0.01562500
## 6 Mike 2 0.05555556
## 7 Linda 2 0.07446809
## 8 Sam 2 0.00000000
## 9 Esther 2 0.10256410
## 10 Mary 2 -0.07936508
The output is a small, tidy table of only the student changes in performance
#Lets also determine the median performance change for all students on each test
medperform1 <- median(stuimp$`Ratio of Improvment Between Terms 1 and 3`[stuimp$`Test Number` == 1])
medperform2 <- median(stuimp$`Ratio of Improvment Between Terms 1 and 3`[stuimp$`Test Number` == 2])
#Test 1
barplot(100 * c(stuimp$`Ratio of Improvment Between Terms 1 and 3`[stuimp$`Test Number` == 1], medperform1),names =
c(stuimp$Student[stuimp$`Test Number` == 1],"Median"),xlab = "Student", ylab = "% Performance Change",main = "Test 1 Improvements",col=rgb(0.5,0.2,0.1,0.9))
#Test 2
barplot(100 * c(stuimp$`Ratio of Improvment Between Terms 1 and 3`[stuimp$`Test Number` == 2], medperform2),names =
c(stuimp$Student[stuimp$`Test Number` == 2],"Median"),xlab = "Student", ylab = "% Performance Change",main = "Test 2 Improvements",col=rgb(0.1,0.0,0.6,0.2))
Overall, students’ median performance increased on both tests, but more on test 2. Linda and mary were the only ones who performed signifcantly worse on the later tests (1 and 2 respectively).
bobross <-read.csv("https://raw.githubusercontent.com/davidblumenstiel/data/master/bobross.csv")
head(bobross)
## EPISODE TITLE APPLE_FRAME AURORA_BOREALIS BARN BEACH BOAT
## 1 S01E01 "A WALK IN THE WOODS" 0 0 0 0 0
## 2 S01E02 "MT. MCKINLEY" 0 0 0 0 0
## 3 S01E03 "EBONY SUNSET" 0 0 0 0 0
## 4 S01E04 "WINTER MIST" 0 0 0 0 0
## 5 S01E05 "QUIET STREAM" 0 0 0 0 0
## 6 S01E06 "WINTER MOON" 0 0 0 0 0
## BRIDGE BUILDING BUSHES CABIN CACTUS CIRCLE_FRAME CIRRUS CLIFF CLOUDS CONIFER
## 1 0 0 1 0 0 0 0 0 0 0
## 2 0 0 0 1 0 0 0 0 1 1
## 3 0 0 0 1 0 0 0 0 0 1
## 4 0 0 1 0 0 0 0 0 1 1
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 1 0 0 0 0 0 1
## CUMULUS DECIDUOUS DIANE_ANDRE DOCK DOUBLE_OVAL_FRAME FARM FENCE FIRE
## 1 0 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 1 0
## 4 0 0 0 0 0 0 0 0
## 5 0 1 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0
## FLORIDA_FRAME FLOWERS FOG FRAMED GRASS GUEST HALF_CIRCLE_FRAME
## 1 0 0 0 0 1 0 0
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0
## HALF_OVAL_FRAME HILLS LAKE LAKES LIGHTHOUSE MILL MOON MOUNTAIN MOUNTAINS
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 1 0
## 3 0 0 0 0 0 0 0 1 1
## 4 0 0 1 0 0 0 0 1 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 1 0 0 0 1 1 1
## NIGHT OCEAN OVAL_FRAME PALM_TREES PATH PERSON PORTRAIT RECTANGLE_3D_FRAME
## 1 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 0 0
## RECTANGULAR_FRAME RIVER ROCKS SEASHELL_FRAME SNOW SNOWY_MOUNTAIN SPLIT_FRAME
## 1 0 1 0 0 0 0 0
## 2 0 0 0 0 1 1 0
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 1 0
## 5 0 1 1 0 0 0 0
## 6 0 0 0 0 1 1 0
## STEVE_ROSS STRUCTURE SUN TOMB_FRAME TREE TREES TRIPLE_FRAME WATERFALL WAVES
## 1 0 0 0 0 1 1 0 0 0
## 2 0 0 0 0 1 1 0 0 0
## 3 0 1 1 0 1 1 0 0 0
## 4 0 0 0 0 1 1 0 0 0
## 5 0 0 0 0 1 1 0 0 0
## 6 0 1 0 0 1 1 0 0 0
## WINDMILL WINDOW_FRAME WINTER WOOD_FRAMED
## 1 0 0 0 0
## 2 0 0 1 0
## 3 0 0 1 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 1 0
This has way to many columns; luckily they all have the same kind of observations, and would be easy to tidy.
#Going to use pivot_gather here; pretty much the same thing
bobross <- bobross %>%
pivot_longer(-c("EPISODE","TITLE"), names_to = "Feature", values_to = "Included")
head(bobross)
## # A tibble: 6 x 4
## EPISODE TITLE Feature Included
## <fct> <fct> <chr> <int>
## 1 S01E01 "\"A WALK IN THE WOODS\"" APPLE_FRAME 0
## 2 S01E01 "\"A WALK IN THE WOODS\"" AURORA_BOREALIS 0
## 3 S01E01 "\"A WALK IN THE WOODS\"" BARN 0
## 4 S01E01 "\"A WALK IN THE WOODS\"" BEACH 0
## 5 S01E01 "\"A WALK IN THE WOODS\"" BOAT 0
## 6 S01E01 "\"A WALK IN THE WOODS\"" BRIDGE 0
Much easier to read now, although our 400 observations have turned into 27000.
#Credit where credit is due: Leo Yi assisted greatly with this chunk
group_by(bobross,Feature) %>%
summarize(Occurances = sum(Included)) %>%
ggplot(aes(x = Feature, y = Occurances, fill = Feature)) +
geom_col() +
theme_linedraw() +
theme(legend.position = "none",axis.text=element_text(size=10),
axis.title=element_text(size=20)) +
coord_flip() +
ggtitle("Number of Features Uses Across All Episodes")+
theme(plot.title = element_text(size=15))
It appears that tree(s) are the most common features in his paintings, followed by clouds if one discounts the seperate types of trees.
fire <- bobross %>%
filter(Feature == 'FIRE') %>%
filter(Included == 1)
fire[,1:2]
## # A tibble: 1 x 2
## EPISODE TITLE
## <fct> <fct>
## 1 S03E10 "\"CAMPFIRE\""
bobross %>%
filter(EPISODE == "S03E10") %>%
filter(Included == 1)
## # A tibble: 7 x 4
## EPISODE TITLE Feature Included
## <fct> <fct> <chr> <int>
## 1 S03E10 "\"CAMPFIRE\"" BUSHES 1
## 2 S03E10 "\"CAMPFIRE\"" DECIDUOUS 1
## 3 S03E10 "\"CAMPFIRE\"" FIRE 1
## 4 S03E10 "\"CAMPFIRE\"" LAKE 1
## 5 S03E10 "\"CAMPFIRE\"" PERSON 1
## 6 S03E10 "\"CAMPFIRE\"" TREE 1
## 7 S03E10 "\"CAMPFIRE\"" TREES 1
We can probably say that the painting in this episode includes a person outdoors by the fire with a lake nearby, likely in some wooded setting.