-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathrun_analysis.R
More file actions
115 lines (100 loc) · 4.59 KB
/
run_analysis.R
File metadata and controls
115 lines (100 loc) · 4.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Website for more information: http://groupware.les.inf.puc-rio.br/har
# Training dataset from : https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv
# Testing dataset from : https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv
# Fetch each dataset only when a local copy is not already present.
# Names are the local file names, values are the remote sources.
data_sources <- c(
  "training.csv" = "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",
  "testing.csv" = "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
)
for (local_file in names(data_sources)) {
  if (!file.exists(local_file)) {
    download.file(data_sources[[local_file]], local_file, method = "curl")
  }
}
# Load the packages used below, installing any that are missing.
# library() is used for attaching (rather than require()) so that a failed
# installation stops the script right here with a clear error instead of
# surfacing later as a "could not find function" failure.
for (pkg in c("data.table", "dplyr", "caret")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
# Load the training and testing datasets as tibbles.
# Spreadsheet error markers ('#DIV/0!'), empty strings and literal 'NA' are
# all read as missing values.
# as_tibble() replaces tbl_df(), which is deprecated in dplyr.
na_markers <- c('#DIV/0!', '', 'NA')
training <- as_tibble(fread("training.csv", na.strings = na_markers))
testing <- as_tibble(fread("testing.csv", na.strings = na_markers))
# Split the labelled data in two: 70% of the rows (sampled on `classe`) are
# used to fit the models, the remaining rows are held out for validation
set.seed(1234) # fixed seed so the partition is reproducible
trainingDS <- createDataPartition(training$classe, p = 0.7, list = FALSE)
actual.validation <- training[-trainingDS, ]
actual.training <- training[trainingDS, ]
# Drop the variables with (near) zero variance.
# The column indices are computed on the training portion only and the very
# same columns are removed from both portions so they stay aligned.
nzv <- nearZeroVar(actual.training)
# Guard against an empty result: indexing with -integer(0) would select
# zero columns and silently wipe out the whole dataset.
if (length(nzv) > 0) {
  actual.training <- actual.training[, -nzv]
  actual.validation <- actual.validation[, -nzv]
}
# Remove variables that are mostly NA (more than 95% missing in training)
mostlyNA <- vapply(
  actual.training,
  function(x) mean(is.na(x)),
  numeric(1)
) > 0.95
actual.training <- actual.training[, !mostlyNA]
actual.validation <- actual.validation[, !mostlyNA]
# At this point we're already down to 59 variables from 160
# The first 5 variables are identifiers that are probably not useful
# for prediction, so get rid of those too,
# dropping the total number of variables to 54 (53 for prediction)
actual.training <- actual.training[, -(1:5)]
actual.validation <- actual.validation[, -(1:5)]
# Build a random forest model, tuned with 3-fold cross-validation
set.seed(1234)
modelRF <- train(
  classe ~ .,
  data = actual.training,
  method = "rf",
  trControl = trainControl(method = "cv", number = 3)
)
# Predict on the validation portion and see how well we do.
# `classe` was read from the CSV as plain text, while confusionMatrix()
# expects two factors sharing the same levels, so the reference labels are
# converted using the levels of the predictions.
prediction.validation.rf <- predict(modelRF, actual.validation)
conf.matrix.rf <- confusionMatrix(
  prediction.validation.rf,
  factor(actual.validation$classe, levels = levels(prediction.validation.rf))
)
print(conf.matrix.rf) # Accuracy = 0.998
# One can also build a generalized boosted model and compare its accuracy
# to the random forest model (5-fold cross-validation, a single repeat)
set.seed(1234)
modelBM <- train(
  classe ~ .,
  data = actual.training,
  method = "gbm",
  trControl = trainControl(method = "repeatedcv", number = 5, repeats = 1),
  verbose = FALSE
)
# Predict on the validation portion and see how well we do.
# `classe` was read from the CSV as plain text, while confusionMatrix()
# expects two factors sharing the same levels, so the reference labels are
# converted using the levels of the predictions.
prediction.validation.bm <- predict(modelBM, actual.validation)
conf.matrix.bm <- confusionMatrix(
  prediction.validation.bm,
  factor(actual.validation$classe, levels = levels(prediction.validation.bm))
)
print(conf.matrix.bm) # Accuracy = 0.9876
# RF (Accuracy : 0.998) performs marginally better than GBM
# (Accuracy : 0.9876) on the validation portion. Either model could be used,
# or the two could be ensembled, although that might be overkill here - in
# any case both yield the same predictions on the testing set.

# Random forest predictions for the testing set
prediction.testing.rf <- predict(modelRF, newdata = testing)
print(prediction.testing.rf)
# [1] B A B A A E D B A A B C B A E E A B B B
# Levels: A B C D E

# Boosted model predictions for the testing set
prediction.testing.bm <- predict(modelBM, newdata = testing)
print(prediction.testing.bm)
# [1] B A B A A E D B A A B C B A E E A B B B
# Levels: A B C D E
# We can look at a few variables to see which ones are the strongest
# predictors; simply look at the boosted model's summary
print(summary(modelBM))
# As can be seen, the most influential variables are:
# var rel.inf
# num_window num_window 21.35545512
# roll_belt roll_belt 18.27724938
# pitch_forearm pitch_forearm 10.13583112
# magnet_dumbbell_z magnet_dumbbell_z 6.72541291
# yaw_belt yaw_belt 6.51200291
# magnet_dumbbell_y magnet_dumbbell_y 5.05649194
# Let's look at a few plots in the training set.
# The plots are printed explicitly, like every other result in this script,
# so that they are also drawn when the file is run through source().
print(qplot(num_window, roll_belt, data = actual.training, col = classe))
print(qplot(num_window, pitch_forearm, data = actual.training, col = classe))
print(qplot(roll_belt, pitch_forearm, data = actual.training, col = classe))