set.seed(6046)
library(kernlab)
################################################
# SVM for classification with a string kernel
################################################
## We are going to use a slightly-processed version of the famous
## Reuters news articles dataset. All articles with no Topic
## annotations are dropped. The text of each article is converted to
## lowercase, whitespace is normalized to single-spaces. Only the
## first term from the Topic annotation list is retained (some
## articles have several topics assigned).
## The resulting dataset is a list of pairs (Topic, News content). We
## will only use three topics for analysis: crude oil, coffee and
## grain-related news.
## The resulting data frame contains 994 news items on crude oil,
## coffee and grain. The news text is the column "Content" and its
## category is the column "Topic". The goal is to create a classifier
## for the news articles.
## Note that we can directly read the compressed version (reuters.txt.gz).
## There is no need to unpack the gz file; for local files R handles unpacking automagically
reuters <- read.table("reuters.txt.gz", header=T)
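## Equivalently (just as an illustration of the same mechanism), the
## compressed connection can be opened explicitly with base R's gzfile():
# reuters <- read.table(gzfile("reuters.txt.gz"), header=T)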
# We leave only three topics for analysis: Crude Oil, Coffee and Grain-related news
reuters <- reuters[reuters$Topic == "crude" | reuters$Topic == "grain" | reuters$Topic == "coffee",]
reuters$Content <- as.character(reuters$Content) # read.table loads strings as factors in R < 4.0, so we coerce back to character
reuters$Topic <- factor(reuters$Topic) # drop unused levels so the factor keeps only the three retained topics
levels(reuters$Topic)
length(reuters$Topic)
table(reuters$Topic)
## an example of a text about coffee
reuters[2,]
## an example of a text about grain
reuters[7,]
## an example of a text about crude oil
reuters[12,]
(N <- dim(reuters)[1]) # number of rows
# we shuffle the data first
set.seed(12)
reuters <- reuters[sample(1:N, N),]
# To deal with textual data we need a string kernel. Several such
# kernels are implemented in the stringdot() function of the kernlab
# package. We shall use the simplest one: the p-spectrum kernel. Its
# feature map represents a string as the multiset of its substrings of
# length p.
# For example, with p=2 we have
# phi("ababc") = ("ab" -> 2, "ba" -> 1, "bc" -> 1, other -> 0)
# We can define a normalized 3-spectrum kernel ('length' is p):
k <- stringdot("spectrum", length=3, normalized=T)
# Let's see some examples:
k("I did it my way", "I did it my way")
k("He did it his way", "I did it my way")
k("I did it my way", "She did it her way")
k("I did it my way", "Let's get our way out")
## We start by doing a kPCA (we'll see this in the next lecture)
## first we define a modified plotting function
plotting <- function (kernelfu, kerneln)
{
  xpercent <- eig(kernelfu)[1]/sum(eig(kernelfu))*100
  ypercent <- eig(kernelfu)[2]/sum(eig(kernelfu))*100
  plot(rotated(kernelfu), col=as.integer(reuters$Topic),
       main=paste("Kernel PCA (", kerneln, ")", format(xpercent+ypercent, digits=3), "%"),
       xlab=paste("1st PC -", format(xpercent, digits=3), "%"),
       ylab=paste("2nd PC -", format(ypercent, digits=3), "%"))
}
## Create the full kernel matrix, this time using a 5-spectrum kernel
k <- stringdot("spectrum", length=5, normalized=T)
K <- kernelMatrix(k, reuters$Content)
dim(K)
K[2,2]
K[2,3:10]
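## Sanity check: with normalized=T every string has unit self-similarity,
## so the diagonal of K should be (numerically) all ones
all(abs(diag(K) - 1) < 1e-10)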
## Plot the result using the first 2 PCs (we can add colors for the three classes)
kpc.reuters <- kpca(K, features=2, kernel="matrix")
plotting(kpc.reuters, "5-spectrum kernel")
## finally add a legend
legend("topleft", legend=c("crude oil", "coffee", "grain"),
       pch=1,                          # same symbol (circle) for all three classes
       col=c("red", "black", "green")) # colors matching the topic levels in the plot
## We can also train a SVM using this kernel matrix in the training set
## First we should split the data into learning (2/3) and test (1/3) parts
ntrain <- round(N*2/3) # number of training examples
tindex <- sample(N,ntrain) # indices of training examples
## Then fit an SVM on the training part
svm1.train <- ksvm(K[tindex,tindex], reuters$Topic[tindex], type="C-svc", kernel='matrix')
## and use it to predict the test part.
## Let SV be the set of support vectors obtained.
## Then it becomes tricky: we must compute the test-vs-SV kernel matrix,
## which we do in two steps:
# First the test-vs-train matrix
testK <- K[-tindex,tindex]
# then we keep only the columns corresponding to the support vectors
testK <- testK[,SVindex(svm1.train),drop=FALSE]
# Now we can predict the test data
# Warning: here we MUST convert the matrix testK to a 'kernelMatrix'
y1 <- predict(svm1.train,as.kernelMatrix(testK))
table (pred=y1, truth=reuters$Topic[-tindex])
cat('Error rate =', 100*sum(y1 != reuters$Topic[-tindex])/length(y1), '%\n')
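## A minimal sketch (our own helper, not part of the original script)
## wrapping the whole protocol above, so other substring lengths p can
## be tried on the same train/test split
eval.spectrum <- function (p)
{
  kp <- stringdot("spectrum", length=p, normalized=T)
  Kp <- kernelMatrix(kp, reuters$Content)
  svm.p <- ksvm(Kp[tindex,tindex], reuters$Topic[tindex], type="C-svc", kernel='matrix')
  testKp <- Kp[-tindex,tindex][,SVindex(svm.p),drop=FALSE]
  y.p <- predict(svm.p, as.kernelMatrix(testKp))
  100*sum(y.p != reuters$Topic[-tindex])/length(y.p) # test error (%)
}
# e.g. eval.spectrum(4) would give the test error of a 4-spectrum kernel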
## now we define a 3D plotting function
library("rgl")
open3d()
plotting3D <- function (kernelfu, kerneln)
{
  xpercent <- eig(kernelfu)[1]/sum(eig(kernelfu))*100
  ypercent <- eig(kernelfu)[2]/sum(eig(kernelfu))*100
  zpercent <- eig(kernelfu)[3]/sum(eig(kernelfu))*100
  # resize window
  par3d(windowRect = c(100, 100, 612, 612))
  plot3d(rotated(kernelfu),
         col = as.integer(reuters$Topic),
         xlab = paste("1st PC -", format(xpercent, digits=3), "%"),
         ylab = paste("2nd PC -", format(ypercent, digits=3), "%"),
         zlab = paste("3rd PC -", format(zpercent, digits=3), "%"),
         main = paste("Kernel PCA (", kerneln, ")"),
         sub = "red - crude oil | black - coffee | green - grain",
         top = TRUE, aspect = FALSE, expand = 1.03)
}
kpc.reuters <- kpca(K, features=3, kernel="matrix")
plotting3D(kpc.reuters, "5-spectrum kernel")