Use `<-` to assign a value to a variable.
# Store one value of each basic type in its own variable.
# Numbers
dusp_course <- 11
# Text (a character string, always written inside quotes)
city_name <- "Cambridge"
# Boolean (true or false); R spells the two values TRUE and FALSE in capitals
sanctuary_city <- TRUE
Putting a variable on a line by itself will display the contents of that variable no matter what type it is.
# Typing a variable's name on its own line prints its contents.
# The [1] at the start of each output line is the position of the first
# item shown on that line.
dusp_course
## [1] 11
# Text is displayed inside quotes
city_name
## [1] "Cambridge"
sanctuary_city
## [1] TRUE
# sqrt() is the familiar square-root function from mathematics.
# Call it directly on a number: the answer is displayed but not saved.
sqrt(27)
## [1] 5.196152
# Store a number in a variable. Nothing is displayed, but the value is
# stored for future use.
x <- 27
# Display what the variable holds
x
## [1] 27
# Apply the function to the stored variable
sqrt(x)
## [1] 5.196152
# Same calculation again, this time saving the answer in y
y <- sqrt(x)
# Display the saved answer
y
## [1] 5.196152
# x is unchanged from before
x
## [1] 27
# Multiply x by 3 and store the product in y; the previous value of y
# is overwritten.
y <- x * 3
y
## [1] 81
# Store the result of a calculation in x (this replaces the 27)
x <- 308 * 0.415
x
## [1] 127.82
# Round to the nearest whole number (0 digits after the decimal point)
d <- 0
y <- round(x, digits = d)
y
## [1] 128
# We have this from before, but it is stored as a number "type"
dusp_course
## [1] 11
# Change it into the text (character) "type", and store it in a new variable.
# The quotes in the output below show that the value is now text.
dusp_course2 <- as.character(dusp_course)
dusp_course2
## [1] "11"
Note: The `paste` function automatically adds spaces between the pieces it joins, while `paste0` does not. I prefer to always use `paste0` and manually put in the spaces if needed.
# dusp_course still holds the number from before
dusp_course
## [1] 11
dusp_text <- "At MIT, DUSP is course:"
# paste0() glues its arguments together with nothing in between
fulltext <- paste0(dusp_text, dusp_course)
fulltext
## [1] "At MIT, DUSP is course:11"
# Now with a space, supplied as its own piece of text
fulltext <- paste0(dusp_text, " ", dusp_course)
fulltext
## [1] "At MIT, DUSP is course: 11"
# c() combines values into a vector (an ordered list of items of one type).
# Here, a vector of numbers
x <- c(5, 3, 8, 7, 3)
x
## [1] 5 3 8 7 3
# A vector of state abbreviations
states <- c("MA", "UT", "MD", "OH", "ND")
states
## [1] "MA" "UT" "MD" "OH" "ND"
# Square brackets pick out a specific item by position, here the third item
states[3]
## [1] "MD"
# We have x from before
x
## [1] 5 3 8 7 3
# Arithmetic applies to every item at once: multiply each item by 10
store <- x * 10
store
## [1] 50 30 80 70 30
Sequences: the colon operator and the seq function
# The colon says: make a list of the numbers between 2 and 10
# Typing every value out by hand
t2 <- c(2, 3, 4, 5, 6, 7, 8, 9, 10)
# The same values using the colon: every whole number from 2 to 10
t <- 2:10
t2
## [1] 2 3 4 5 6 7 8 9 10
t
## [1] 2 3 4 5 6 7 8 9 10
# Even numbers, typed out by hand
w <- c(2, 4, 6, 8)
w
## [1] 2 4 6 8
# The sequence function does the same job for any range:
# start at 2, stop at 1000, step by 2
w2 <- seq(from = 2, to = 1000, by = 2)
w2
## [1] 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30
## [16] 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60
## [31] 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90
## [46] 92 94 96 98 100 102 104 106 108 110 112 114 116 118 120
## [61] 122 124 126 128 130 132 134 136 138 140 142 144 146 148 150
## [76] 152 154 156 158 160 162 164 166 168 170 172 174 176 178 180
## [91] 182 184 186 188 190 192 194 196 198 200 202 204 206 208 210
## [106] 212 214 216 218 220 222 224 226 228 230 232 234 236 238 240
## [121] 242 244 246 248 250 252 254 256 258 260 262 264 266 268 270
## [136] 272 274 276 278 280 282 284 286 288 290 292 294 296 298 300
## [151] 302 304 306 308 310 312 314 316 318 320 322 324 326 328 330
## [166] 332 334 336 338 340 342 344 346 348 350 352 354 356 358 360
## [181] 362 364 366 368 370 372 374 376 378 380 382 384 386 388 390
## [196] 392 394 396 398 400 402 404 406 408 410 412 414 416 418 420
## [211] 422 424 426 428 430 432 434 436 438 440 442 444 446 448 450
## [226] 452 454 456 458 460 462 464 466 468 470 472 474 476 478 480
## [241] 482 484 486 488 490 492 494 496 498 500 502 504 506 508 510
## [256] 512 514 516 518 520 522 524 526 528 530 532 534 536 538 540
## [271] 542 544 546 548 550 552 554 556 558 560 562 564 566 568 570
## [286] 572 574 576 578 580 582 584 586 588 590 592 594 596 598 600
## [301] 602 604 606 608 610 612 614 616 618 620 622 624 626 628 630
## [316] 632 634 636 638 640 642 644 646 648 650 652 654 656 658 660
## [331] 662 664 666 668 670 672 674 676 678 680 682 684 686 688 690
## [346] 692 694 696 698 700 702 704 706 708 710 712 714 716 718 720
## [361] 722 724 726 728 730 732 734 736 738 740 742 744 746 748 750
## [376] 752 754 756 758 760 762 764 766 768 770 772 774 776 778 780
## [391] 782 784 786 788 790 792 794 796 798 800 802 804 806 808 810
## [406] 812 814 816 818 820 822 824 826 828 830 832 834 836 838 840
## [421] 842 844 846 848 850 852 854 856 858 860 862 864 866 868 870
## [436] 872 874 876 878 880 882 884 886 888 890 892 894 896 898 900
## [451] 902 904 906 908 910 912 914 916 918 920 922 924 926 928 930
## [466] 932 934 936 938 940 942 944 946 948 950 952 954 956 958 960
## [481] 962 964 966 968 970 972 974 976 978 980 982 984 986 988 990
## [496] 992 994 996 998 1000
# length() counts the number of items in a vector
l <- length(x)
l
## [1] 5
# mean() calculates the average of the values
m <- mean(x)
m
## [1] 5.2
# We still have x from before
x
## [1] 5 3 8 7 3
# sum() calculates the total
t <- sum(x)
t
## [1] 26
# Divide every value by the total to get each value's share of the total
# (a proportion between 0 and 1; multiply by 100 for a percent)
p <- x / t
p
## [1] 0.1923077 0.1153846 0.3076923 0.2692308 0.1153846
# We have the variables x and states from before. Both vectors need to
# contain the same number of items.
x
## [1] 5 3 8 7 3
# Assign the states as names for the items in the vector. In essence these
# are column headers; they don't affect the numbers themselves.
names(x) <- states
x
## MA UT MD OH ND
## 5 3 8 7 3
# Now that the values have names associated with them, the names
# show up on the x-axis as labels.
barplot(x)
# Now with axis labels and a title
barplot(
  x,
  xlab = "State",
  ylab = "# of students from this state",
  main = "Where are they from?"
)
# REPEAT BUT MAKE HORIZONTAL AND CHANGE COLOR
# Google "r barplot" for help
# rivers is a built-in dataset which is a vector of numbers:
# the lengths (in miles) of 141 "major" rivers in North America,
# as compiled by the US Geological Survey.
rivers
## [1] 735 320 325 392 524 450 1459 135 465 600 330 336 280 315 870
## [16] 906 202 329 290 1000 600 505 1450 840 1243 890 350 407 286 280
## [31] 525 720 390 250 327 230 265 850 210 630 260 230 360 730 600
## [46] 306 390 420 291 710 340 217 281 352 259 250 470 680 570 350
## [61] 300 560 900 625 332 2348 1171 3710 2315 2533 780 280 410 460 260
## [76] 255 431 350 760 618 338 981 1306 500 696 605 250 411 1054 735
## [91] 233 435 490 310 460 383 375 1270 545 445 1885 380 300 380 377
## [106] 425 276 210 800 420 350 360 538 1100 1205 314 237 610 360 540
## [121] 1038 424 310 300 444 301 268 620 215 652 900 525 246 360 529
## [136] 500 720 270 430 671 1770
# You should think of a "dataframe" as an Excel spreadsheet
# with columns and rows. Here the rivers vector becomes its only column.
mydata <- data.frame(rivers)
# Count the rows
nrow(mydata)
## [1] 141
# head() displays the first six rows by default
head(mydata)
## rivers
## 1 735
## 2 320
## 3 325
## 4 392
## 5 524
## 6 450
# Ask for the first ten rows instead
head(mydata, n = 10)
## rivers
## 1 735
## 2 320
## 3 325
## 4 392
## 5 524
## 6 450
## 7 1459
## 8 135
## 9 465
## 10 600
# tail() displays the last six rows
tail(mydata)
## rivers
## 136 500
## 137 720
## 138 270
## 139 430
## 140 671
## 141 1770
# summary() reports the minimum, quartiles, mean and maximum of each column
summary(mydata)
## rivers
## Min. : 135.0
## 1st Qu.: 310.0
## Median : 425.0
## Mean : 591.2
## 3rd Qu.: 680.0
## Max. :3710.0
# The $ sign picks one column out of the dataframe. Pass that column to
# hist() and let the function determine the number of buckets itself
hist(mydata$rivers)
# Choose the number of buckets yourself (hist() treats a single number
# as a suggestion, so the plot may not show exactly that many)
hist(mydata$rivers, breaks = 4)
# Add a new column holding the lengths in kilometers instead of miles
# (1 mile is roughly 1.609 km)
mydata$riverskm <- mydata$rivers * 1.609
head(mydata)
## rivers riverskm
## 1 735 1182.615
## 2 320 514.880
## 3 325 522.925
## 4 392 630.728
## 5 524 843.116
## 6 450 724.050
# The water quality for each river is evaluated on a scale of
# 1 to 100. Let's assign a random value to each river. We
# need 141 random numbers, one for each row of mydata.
# replace = TRUE allows the same score to be drawn more than once. It is
# spelled out as TRUE because T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
# The numbers are random, so your values will differ from the ones shown
# below unless you call set.seed() first.
s <- sample(1:100, 141, replace = TRUE)
s
## [1] 30 43 99 26 96 29 73 66 68 60 32 83 33 32 72 33 61 35
## [19] 13 90 54 56 66 16 44 48 36 87 36 3 35 18 20 77 87 84
## [37] 3 47 77 79 53 60 92 37 38 38 94 33 50 3 26 11 36 3
## [55] 59 96 64 85 76 15 77 2 63 61 5 58 54 97 58 23 50 88
## [73] 16 32 85 41 47 89 58 66 20 54 94 54 38 70 65 28 40 92
## [91] 17 47 39 91 99 70 12 36 30 86 32 86 11 45 68 26 77 93
## [109] 85 37 67 93 12 20 55 64 27 66 42 70 24 44 81 97 66 53
## [127] 81 8 68 100 37 86 15 78 67 22 1 84 98 40 36
# Make a new column in the dataframe with water quality.
# s holds one value per row, so each river gets its own score.
mydata$quality <- s
# Check the result: the dataframe now has three columns
head(mydata)
## rivers riverskm quality
## 1 735 1182.615 30
## 2 320 514.880 43
## 3 325 522.925 99
## 4 392 630.728 26
## 5 524 843.116 96
## 6 450 724.050 29
We want to investigate whether there is a correlation between the length of a river and its water quality. To do that, we want to make a scatter plot of the river lengths and the water quality.
# Scatter plot: river length on the x-axis, water quality on the y-axis
plot(mydata$rivers, mydata$quality)
# Regression line (NOTE: it's y ~ x). Naming the columns in the formula and
# passing the dataframe through `data` gives the coefficients clean names
# ("rivers" rather than "mydata$rivers") and lets the fitted model be used
# with predict() on new data.
myline <- lm(quality ~ rivers, data = mydata)
# Add the line to the plot
abline(myline, col = "red")