Monday, December 8, 2014

R Code Snippets

Computing Summary Statistics for a Subset of a Data Frame:

# Summary statistics for the rows of `data` selected by `monthIsMay`.
# NOTE(review): assumes `monthIsMay` is a logical vector with one entry per row — confirm.
summary(data[monthIsMay == TRUE,])

Reading a CSV File into a Data Frame:

# Load the CSV file from the working directory into a data frame.
data <- read.csv(file = "hw1_data.csv")

# Show rows 1 and 2 with every column.
data[c(1, 2), ]

Adding all Rows based on a Column Condition:

# Sum `Emissions` within each distinct `year` of NEI; the result has one row per year.
aggregateEmissionsValuesByYearDataSet <- aggregate(Emissions ~ year, data = NEI, FUN = sum)

Histogram:


# Histogram of the step counts with explicit axis labels and title.
hist(
  dailyStepData$Steps,
  xlab = "Number of Steps",
  ylab = "Number of Days",
  main = "Histogram of Number of Steps Daily"
)

Using rbinom:

# rbinom(n, size, prob): n = how many random values to draw,
# size = number of trials behind each value, prob = success probability per trial.
# Here: a single draw counting the successes in 5 trials with probability 1/6.
rbinom(1,5,1/6)

# Logical mask marking the missing entries of `ozone`.
isNaOzone <- is.na(ozone)

# Per-column summary statistics for the whole data frame.
summary(data)

# Rows whose Ozone value exceeds 31. A missing Ozone value gives NA in the mask,
# and an NA in a logical row index yields an all-NA row in the result.
over31Ozone <- data$Ozone > 31
data2 <- data[over31Ozone, ]

# Keep the rows flagged by `monthIsJune`, with all columns. The comma is required:
# a data frame indexed with a single subscript is subset by column, not by row.
data4 <- data[monthIsJune == TRUE, ]

# Replace every NA entry of `mdhIsNa` with FALSE.
mdhIsNa[is.na(mdhIsNa)] <- FALSE

Plotting a Simple Graph with a Fitted Line (abline):

# Scatter plot of total emissions per year with a fitted linear trend line.
# `main` and `pch` have to be arguments of plot(); handed to with() they are
# silently ignored and the plot loses its title and point style.
with(
  aggregateEmissionsValuesByYearDataSet,
  plot(year, Emissions, main = "Total Emissions by Year", pch = 20)
)
# Least-squares fit of Emissions on year; abline() draws it on the open plot.
model <- lm(Emissions ~ year, data = aggregateEmissionsValuesByYearDataSet)
abline(model, lwd = 2)

Printing a Plot to a PNG File:

# Open a PNG graphics device with a transparent background.
png(filename = "myplot.png", bg = "transparent")
# Draw the points 1..10 and a white rectangle spanning (1, 5) to (3, 7).
plot(1:10)
rect(xleft = 1, ybottom = 5, xright = 3, ytop = 7, col = "white")
# Close the device so the file is written to disk.
dev.off()

Getting the Sum of the Columns:

# Column totals for elements 4 and 9 of `x`.
# NOTE(review): a single subscript selects columns only when `x` is a data frame — confirm.
colSums(x[c(4,9)])

Plotting: Printing A Plot with Five Panels for Months using ggplot2:

# Turn Month into a factor so it can be used as a faceting variable.
airquality$Month <- factor(airquality$Month)

# Wind vs. Ozone scatter plot with one panel per Month level, laid out in a row.
qplot(Wind, Ozone, data = airquality, facets = . ~ Month)

Building a Matrix in R:

# 3 x 3 matrix. matrix() fills column by column, so each line of values
# below is one column of B.
B <- matrix(
  c(
    2, 4, 3,
    1, 5, 7,
    6, 5, 9
  ),
  nrow = 3,
  ncol = 3
)

Reading RDS Files:

# Restore the two serialized R objects from .rds files in the working directory.
NEI <- readRDS("summarySCC_PM25.rds")
SCC <- readRDS("Source_Classification_Code.rds")

Processing Fixed Width Columns:

# Read a fixed-width text file: skip the first 4 lines, then split every
# remaining line into 9 fields using the character widths listed below.
x <- read.fwf(
    file="fwff.txt",
    skip=4,
    widths=c(12, 7,4, 9,4, 9,4, 9,4))

Setting an Object's Column Names from a Column of Another Table:

# Use the second column of `x_test_data_titles` as the column names of `x_test_data`.
colnames(x_test_data) <- x_test_data_titles[,2]

Getting the Values Common to Two Columns:

# Values that occur in both CountryCode columns; intersect() returns each value once.
both <- intersect(eduData$CountryCode, gdpData$CountryCode)

Downloading a file from a URL:

# Download `fileURL` to "idahoHousing.csv" using the "curl" download method.
# NOTE(review): `fileURL` must already be defined; method = "curl" presumably needs the curl executable on the PATH — confirm.
download.file(fileURL, destfile="idahoHousing.csv", method="curl")

Merging Data Frame:

# Join the two data sets on their shared "CountryCode" column.
completeDataSet <- merge(
  x = gdpData,
  y = eduData,
  by = "CountryCode"
)

Stripping Characters from a Vector:

# Sample strings containing the letter "e" in different positions.
group <- c(
  "12357e",
  "12575e",
  "197e18",
  "e18947"
)
# Delete every "e"; fixed = TRUE matches it as a literal character.
v <- gsub(pattern = "e", replacement = "", x = group, fixed = TRUE)

Setting Column Names:

# Assign all ten column names of gdpData in one go.
names(gdpData) <- c(
  "CountryCode",
  "CountryNumberCode",
  "V3",
  "FullCountryName",
  "GDP",
  "V6",
  "V7",
  "V8",
  "V9",
  "V10"
)

Removing All Variables from the Current Environment Except Functions:

# Remove every object in the current environment except functions:
# ls() lists the objects, lsf.str() lists the functions, and setdiff()
# keeps only the names that are not functions.
rm(list = setdiff(ls(), lsf.str()))

Ordering a Data Frame:

# Sort the rows by GDPRank in ascending order (order()'s default direction).
completeOrderedDataSet <- completeDataSet[order(completeDataSet[["GDPRank"]]), ]

Downloading a JPEG:

# Download a JPEG and read it into R with the jpeg package.
fileURL <- "https://d396qusza40orc.cloudfront.net/getdata%2Fjeff.jpg"
download.file(fileURL, destfile = "instructorPhoto.jpg", method = "curl")
# One-time installation of the package.
install.packages("jpeg")
# Attach the package with library(); there is no install() function in R,
# and without attaching, readJPEG() is not found.
library(jpeg)
# native = TRUE asks readJPEG() for the native raster representation
# instead of the default array.
img <- readJPEG("instructorPhoto.jpg", native = TRUE)

Getting a Quantile from an Image:

# 30% and 80% quantiles of the values in `img`; type = 1 selects quantile()'s
# algorithm 1 (inverse of the empirical distribution function).
quantile(img, c(0.3, 0.8), type = 1)

Installing a Package:

# Install the xlsx package; attach it afterwards with library(xlsx) before use.
install.packages("xlsx")

Using XML/RCurl to extract data

# Fetch an XML document and pull out the text of every <zipcode> element.
# NOTE(review): `fileURL` must already point at the XML document.
install.packages(c("RCurl", "XML"))
library(RCurl)  # provides getURL()
library(XML)    # provides xmlParse(), xmlRoot(), xpathSApply() and xmlValue()
xData <- getURL(fileURL)
doc <- xmlParse(xData)
rootNode <- xmlRoot(doc)
# Run the XPath query once, keep the result, then print it.
zips <- xpathSApply(rootNode, "//zipcode", xmlValue)
zips
# Only the entries equal to "21231".
zips[zips == "21231"]

Unzipping a zip file:

# Extract the archive's contents (unzip() extracts into the working directory by default).
unzip(zipfile = "galaxyProject1.zip")

Changing the Working Directory:

# Make the "UCI HAR Dataset" sub-directory the working directory.
setwd(dir = "./UCI HAR Dataset")

Getting the Current Working Directory:

# Return the path of the current working directory.
getwd()

Listing all the Files in a Directory:

# Names of the files in the current working directory (the default path is ".").
list.files()

Clearing the Console:

# Write a form-feed character (octal 014) to the console.
# NOTE(review): this clears the RStudio console; other front ends may ignore it — confirm.
cat("\014")

Removing Variables From Environment:

# Delete the listed objects from the current environment (rm() and remove() are the same function).
rm(
  activity_labels_data,
  subject_test_data,
  test_table1,
  x_test_data,
  x_test_data_titles,
  y_test_data
)

Subsetting a Data Frame:

# Logical mask of the rows of `outcome` whose State equals `state`,
# followed by the matching rows with all columns.
stateTFSubset <- outcome$State == state
stateSubset <- outcome[stateTFSubset, ]
# Logical mask of the records whose fips code is "06037".
LAAutoTFDataSet <- carNEIDataSet$fips == "06037"

Getting the Cumulative Variance From PCA with caret:

# Split the data, run caret's PCA pre-processing on a block of predictors and
# compute the share of total variance covered by the first nine stored
# standard deviations.
adData <- data.frame(diagnosis, predictors)
# Row indices for the training portion (p = 3/4 of the data).
inTrain <- createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]

# Column 1 (diagnosis) plus columns 58 to 69.
training2 <- training[, c(1, 58:69)]

# PCA on everything except the diagnosis column, keeping 10 components.
preProc <- preProcess(training2[, -1], method = "pca", pcaComp = 10)

# NOTE(review): in caret, preProcess()$std looks like the per-predictor
# standard deviations used for scaling, not the component standard deviations;
# prcomp()$sdev is the usual source for explained variance — confirm.
stds <- preProc$std

# Vectorised replacement for the hand-written nine-term sum
# stds[1]^2 / sum(stds^2) + ... + stds[9]^2 / sum(stds^2).
sum(stds[1:9]^2) / sum(stds^2)

Getting the Accuracy of a PCA based Prediction:

# Apply the pre-processing learned on the training data to the same columns
# of the test set, then compare the model's predictions with the true labels.
# NOTE(review): `modelFit` must already hold a model trained on the
# PCA-transformed training data.
testing2 <- testing[, c(1, 58:69)]
testPC <- predict(preProc, testing2[, -1])
# confusionMatrix() takes the predictions as `data` and the true classes as
# `reference`; with the two swapped the accuracy is unchanged, but
# sensitivity, specificity and the table orientation are reported backwards.
confusionMatrix(
  data = predict(modelFit, testPC),
  reference = testing2$diagnosis
)

Getting the Cumulative Variance From PCA with prcomp:

# PCA with base R's prcomp(). Column 1 of training2 is the diagnosis label,
# which must be dropped: prcomp() only accepts numeric columns.
# NOTE(review): prcomp() centres but does not scale by default; pass
# scale. = TRUE if the result should be comparable to a scaled PCA — confirm.
training2prcomp <- prcomp(training2[, -1])
# The "Cumulative Proportion" row of the summary is the cumulative variance.
summary(training2prcomp)

Replacing Factors with Numbers:

# Recode diagnosis: "Impaired" becomes 2, every other non-missing value becomes 1.
training2$diagnosis <- ifelse(training2$diagnosis == "Impaired", 2, 1)

Running an R Script:

# Read and evaluate the expressions in run_analysis.R (path relative to the working directory).
source("run_analysis.R")

Getting the First Two Rows of Columns 1 to 5 (with Their Titles) from a data.table:

# Rows 1-2 and columns 1-5; with = FALSE makes data.table treat the column
# subscript as column positions rather than an expression.
complete_table[1:2, 1:5, with = FALSE]

Replacing All Values in a Table (numeric but it shouldn't matter) with Corresponding Text Values from another Table:

# Update by reference (:=): overwrite activity_code with the value found in
# column 2 of activity_labels_data at the row given by the old code.
# NOTE(review): presumably row i of activity_labels_data describes code i — confirm.
labeled_table2[, activity_code:=activity_labels_data[activity_code,2]]

Printing a Specific Set of Values from a Table:

# First ten rows of the first two columns, selected by position.
labeled_table[1:10, 1:2, with = FALSE]

Plotting a Variable in qplot against its index:

# Plot CompressiveStrength against its position in the data;
# seq_along() supplies 1, 2, ..., n for the x axis.
qplot(seq_along(CompressiveStrength), CompressiveStrength, data = training)

Plotting a Variable in qplot with Colors against its index:

# Bin CoarseAggregate into four groups with cut2().
# NOTE(review): cut2() presumably comes from the Hmisc package, which must be attached — confirm.
cutCoarseAggregate <- cut2(training$CoarseAggregate, g = 4)
# Same index plot, with the points coloured by their CoarseAggregate group.
qplot(
  seq_along(CompressiveStrength),
  CompressiveStrength,
  color = cutCoarseAggregate,
  data = training
)

Creating Two Example data.tables to Combine (Not Based on a Key); the snippet only builds and prints them:

# Five-row example table with randomly sampled letters and numbers.
DT1 <- data.table(
  col1 = sample(letters, 5, replace = TRUE),
  col2 = sample(LETTERS[1:5], 5, replace = TRUE),
  col3 = sample(1:2, 5, replace = TRUE)
)
# Ten-row example table with differently named columns.
DT2 <- data.table(
  col4 = sample(1:3, 10, replace = TRUE),
  col5 = sample(LETTERS[1:5], 10, replace = TRUE),
  col6 = sample(1:100, 10, replace = TRUE)
)
# Wrapping a name in parentheses prints the object at the console.
(DT1)
(DT2)



No comments:

Post a Comment