Getting Summary Statistics for a Data Frame:
# Print summary() of the rows of `data` selected by the test
# `monthIsMay == TRUE`; the empty column index keeps every column.
# NOTE(review): `monthIsMay` is not defined in this snippet — presumably a
# logical vector with one entry per row of `data`; confirm.
summary(data[monthIsMay == TRUE,])
Reading a CSV File into a Data Frame:
# Load the CSV file into a data frame, then display its first two rows.
data <- read.csv(file = "hw1_data.csv")
data[c(1, 2), ]
Summing a Column's Values Grouped by Another Column:
# Apply sum() to Emissions within each distinct value of year in NEI and
# store the per-year totals.
aggregateEmissionsValuesByYearDataSet <- aggregate(Emissions ~ year, data = NEI, FUN = sum)
Histogram:
# Histogram of the Steps column with custom axis labels and title.
hist(
  dailyStepData$Steps,
  main = "Histogram of Number of Steps Daily",
  xlab = "Number of Steps",
  ylab = "Number of Days"
)
Using rbinom:
# rbinom(n, size, prob): draw n random values, each being the number of
# successes in `size` trials that succeed with probability `prob`.
# Here: one draw counting successes in 5 trials with probability 1/6.
rbinom(n = 1, size = 5, prob = 1 / 6)
# Logical-mask subsetting of a data frame.
# NOTE(review): `ozone`, `monthIsJune` and `mdhIsNa` are not defined in this
# snippet — presumably `ozone` is data$Ozone and the other two are logical
# vectors with one entry per row of `data`; confirm.

# TRUE wherever the ozone value is missing.
isNaOzone <- is.na(ozone)
summary(data)

# Keep the rows whose Ozone reading exceeds 31 (all columns).
over31Ozone <- data$Ozone > 31
data2 <- data[over31Ozone == TRUE, ]

# Row subsetting needs the trailing comma: data[mask] without it selects
# columns instead of rows.
data4 <- data[monthIsJune == TRUE, ]

# Turn the NA entries of a logical mask into FALSE so that indexing with the
# mask does not return all-NA rows.
mdhIsNa[is.na(mdhIsNa)] <- FALSE
Plotting a Simple Graph with a Regression Line (abline):
# Scatter plot of total emissions per year with a fitted regression line.
# `main` and `pch` have to be arguments of plot(); written after plot()'s
# closing parenthesis they are handed to with(), which ignores them.
with(
  aggregateEmissionsValuesByYearDataSet,
  plot(year, Emissions, main = "Total Emissions by Year", pch = 20)
)
# Fit Emissions as a linear function of year and overlay the fitted line.
model <- lm(Emissions ~ year, aggregateEmissionsValuesByYearDataSet)
abline(model, lwd = 2)
Printing a Plot to a PNG File:
# Open a PNG graphics device, draw on it, then close the device.
png(filename = "myplot.png", bg = "transparent")
plot(1:10)
rect(xleft = 1, ybottom = 5, xright = 3, ytop = 7, col = "white")
dev.off()
Getting the Sum of the Columns:
# Column sums of the 4th and 9th columns of x.
# NOTE(review): x[c(4,9)] selects columns only when x is a data frame (or
# list); on a matrix it would pick two single elements — confirm x's type.
colSums(x[c(4,9)])
Plotting: Printing A Plot with Five Panels for Months using ggplot2:
# Convert Month to a factor, then plot Ozone against Wind with one panel per
# Month value (the `. ~ Month` facet formula lays the panels out in one row).
airquality <- transform(airquality, Month = factor(Month))
qplot(x = Wind, y = Ozone, data = airquality, facets = . ~ Month)
Building a Matrix in R:
# 3 x 3 matrix; the values fill column by column (first column is 2, 4, 3).
B <- matrix(
  data = c(2, 4, 3, 1, 5, 7, 6, 5, 9),
  nrow = 3,
  ncol = 3
)
Reading RDS Files:
# Restore the R objects stored in the two .rds files.
NEI <- readRDS(file = "summarySCC_PM25.rds")
SCC <- readRDS(file = "Source_Classification_Code.rds")
Processing Fixed Width Columns:
# Read a fixed-width file: skip the first 4 lines, then cut each line into
# nine fields of the listed character widths.
x <- read.fwf(
  file = "fwff.txt",
  widths = c(12, 7, 4, 9, 4, 9, 4, 9, 4),
  skip = 4
)
Setting an Object's Column Names from a Data Table:
# Use the values in the second column of x_test_data_titles as the column
# names of x_test_data.
colnames(x_test_data) <- x_test_data_titles[,2]
Getting the Values Common to Two Columns:
# CountryCode values that occur in both eduData and gdpData
# (intersect() returns each common value once).
both <- intersect(eduData$CountryCode, gdpData$CountryCode)
Downloading a file from a URL:
# Save the resource at fileURL as idahoHousing.csv using the curl method.
download.file(
  url = fileURL,
  destfile = "idahoHousing.csv",
  method = "curl"
)
Merging Data Frames:
# Join the two data frames on their shared CountryCode column.
completeDataSet <- merge(x = gdpData, y = eduData, by = "CountryCode")
Stripping Characters from a Vector:
# Remove every "e" from each string in the vector.
group <- c("12357e", "12575e", "197e18", "e18947")
v <- gsub(pattern = "e", replacement = "", x = group)
Setting Column Names:
# Assign all ten column names of gdpData in one step.
names(gdpData) <- c(
  "CountryCode", "CountryNumberCode", "V3", "FullCountryName", "GDP",
  "V6", "V7", "V8", "V9", "V10"
)
Removing All Variables from the Current Environment Except Functions:
# ls() lists every object name in the current environment; lsf.str() lists
# only the functions. setdiff() leaves the non-function names, which rm()
# then removes.
rm(list = setdiff(ls(), lsf.str()))
Ordering a Data Frame:
# Sort the rows of completeDataSet by ascending GDPRank.
completeOrderedDataSet <- completeDataSet[
  with(completeDataSet, order(GDPRank, decreasing = FALSE)),
]
Downloading a JPEG:
# Download a JPEG file and read it into R with the jpeg package.
fileURL <- "https://d396qusza40orc.cloudfront.net/getdata%2Fjeff.jpg"
download.file(fileURL, destfile = "instructorPhoto.jpg", method = "curl")
install.packages("jpeg")
# Attach the installed package so readJPEG() is available; base R has no
# install() function.
library(jpeg)
# native = TRUE asks readJPEG() for a native raster representation.
img <- readJPEG("instructorPhoto.jpg", native = TRUE)
Getting a Quantile from an Image:
# 30th and 80th percentiles of the values in img; type = 1 selects
# quantile()'s first algorithm (inverse of the empirical distribution
# function).
quantile(img, c(0.3, 0.8), type = 1)
Installing a Package:
# Install the xlsx package; it still has to be attached with library(xlsx)
# before its functions can be called.
install.packages("xlsx")
Using XML/RCurl to Extract Data:
# Fetch an XML document with RCurl and query it with XPath.
install.packages("RCurl")
library(RCurl)
# xmlParse(), xmlRoot(), xpathSApply() and xmlValue() come from the XML
# package, so it has to be attached as well (install it first with
# install.packages("XML") if it is missing).
library(XML)
# NOTE(review): fileURL is not set in this snippet; it must point at an XML
# document.
xData <- getURL(fileURL)
doc <- xmlParse(xData)
rootNode <- xmlRoot(doc)
# Text content of every <zipcode> node: printed first, then stored.
xpathSApply(rootNode, "//zipcode", xmlValue)
zips <- xpathSApply(rootNode, "//zipcode", xmlValue)
# Keep only the entries equal to "21231".
zips[zips == "21231"]
Unzipping a zip file:
# Extract the archive's contents into the current working directory.
unzip(zipfile = "galaxyProject1.zip")
Changing the Working Directory:
# Make the "UCI HAR Dataset" subfolder the working directory.
setwd(dir = "./UCI HAR Dataset")
Getting the Working Directory:
# Return the path of the current working directory.
getwd()
Listing all the Files in a Directory:
# Names of the files in the current working directory.
list.files(path = ".")
Clearing the Console:
# "\014" is the form-feed control character (Ctrl+L).
# NOTE(review): this clears the console in RStudio; other front ends may
# simply print the character — confirm for your environment.
cat("\014")
Removing Variables From Environment:
# Delete the listed objects from the current environment.
rm(
  activity_labels_data,
  subject_test_data,
  test_table1,
  x_test_data,
  x_test_data_titles,
  y_test_data
)
Subsetting a Data Frame:
# Logical mask of the rows whose State equals `state`, then the matching
# rows of `outcome` with all columns kept.
stateTFSubset <- outcome$State == state
stateSubset <- outcome[stateTFSubset, ]
# Logical mask of the carNEIDataSet rows whose fips code is "06037".
LAAutoTFDataSet <- carNEIDataSet$fips == "06037"
Getting the Cumulative Variance From PCA with caret:
# Share of the total squared standard deviation covered by the first nine
# entries of preProc$std.
adData <- data.frame(diagnosis, predictors)
# Put 75% of the rows in the training set; [[1]] takes the index vector out
# of the list returned by createDataPartition().
inTrain <- createDataPartition(adData$diagnosis, p = 3 / 4)[[1]]
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]
# Column 1 (diagnosis) plus columns 58 to 69.
training2 <- training[, c(1, 58:69)]
# PCA pre-processing on everything except column 1, keeping 10 components.
preProc <- preProcess(training2[, -1], method = "pca", pcaComp = 10)
stds <- preProc$std
# NOTE(review): in caret, preProc$std appears to hold the standard deviations
# of the input columns used for scaling rather than those of the principal
# components — confirm this is the quantity wanted.
# Vectorised form of stds[1]^2 / sum(stds^2) + ... + stds[9]^2 / sum(stds^2).
sum(stds[1:9]^2) / sum(stds^2)
Getting the Accuracy of a PCA based Prediction:
# Apply the PCA transformation learned on the training set to the same
# columns of the test set, then tabulate the true diagnoses against the
# model's predictions.
# NOTE(review): modelFit is not defined in this snippet — presumably a model
# trained on the PCA-transformed training data; confirm.
testing2 <- testing[, c(1, 58:69)]
testPC <- predict(preProc, testing2[,-1])
confusionMatrix(testing2$diagnosis, predict(modelFit, testPC))
Getting the Cumulative Variance From PCA with prcomp:
# Principal components of the predictor columns; summary() reports the
# cumulative proportion of variance per component.
# Column 1 (diagnosis) is left out: it is the class label, not a numeric
# predictor, and prcomp() only accepts numeric input.
training2prcomp <- prcomp(training2[, -1])
summary(training2prcomp)
Replacing Factors with Numbers:
# Recode diagnosis: 2 where the value is "Impaired", 1 otherwise
# (NA stays NA).
training2$diagnosis <- ifelse(training2$diagnosis == "Impaired", 2, 1)
Running an R Script:
# Execute the R code in run_analysis.R in the current session.
source(file = "run_analysis.R")
Getting the First Two Rows of the 1st, 2nd, 3rd, 4th, and 5th Columns with the Titles:
# First two rows of columns 1 to 5; with = FALSE makes data.table treat the
# numbers as column positions.
complete_table[1:2, 1:5, with = FALSE]
Replacing All Values in a Table (numeric but it shouldn't matter) with Corresponding Text Values from another Table:
# data.table update by reference: overwrite activity_code with the value
# found in column 2 of activity_labels_data at the row given by the code.
# NOTE(review): assumes activity_code holds row positions of
# activity_labels_data and that activity_labels_data[i, 2] returns a plain
# vector (i.e. it is a data frame, not a data.table) — confirm.
labeled_table2[, activity_code:=activity_labels_data[activity_code,2]]
Printing a Specific Set of Values from a Table:
# First ten rows of the first two columns, selected by position.
labeled_table[1:10, 1:2, with = FALSE]
Plotting a Variable in qplot against its index:
# Scatter plot of CompressiveStrength (y) against its position 1, 2, ...
# in the training data (x).
qplot(seq_along(CompressiveStrength), CompressiveStrength, data = training)
Plotting a Variable in qplot with Colors against its index:
# Cut CoarseAggregate into four groups (g = 4) and use the group as the
# point colour when plotting CompressiveStrength against its index.
cutCoarseAggregate <- cut2(training$CoarseAggregate, g = 4)
qplot(
  seq_along(CompressiveStrength),
  CompressiveStrength,
  data = training,
  color = cutCoarseAggregate
)
How to Combine Two Different Tables' Data (Not Based on a Key):
# Two example data.tables filled with random samples: DT1 has 5 rows
# (col1, col2, col3) and DT2 has 10 rows (col4, col5, col6).
DT1<-data.table(col1=sample(letters,5,replace=TRUE),col2=sample(LETTERS[1:5],5,replace=TRUE),col3=sample(1:2,5,replace=TRUE))
DT2<-data.table(col4=sample(1:3,10,replace=TRUE),col5=sample(LETTERS[1:5],10,replace=TRUE),col6=sample(1:100,10,replace=TRUE))
# Wrapping a name in parentheses prints the object.
(DT1)
(DT2)