This repository was archived by the owner on May 21, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclustering.R
140 lines (134 loc) · 5.69 KB
/
clustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
library(kernlab)
library(tm)
source("connect.R")
source("gapkernel.R")
#Reading and sampling of the data
N= 50 # number of data point
reuters <- read.table("reuters.txt.gz", header=TRUE)
reuters <- reuters[reuters$Topic == "crude" | reuters$Topic == "grain" | reuters$Topic == "coffee",]
reuters <- reuters[sample(1:nrow(reuters),N),]
reuters$Content <- as.character(reuters$Content)
reuters$Topic <- factor(reuters$Topic)
## Preprocessing of the data
docs <- Corpus(VectorSource(reuters$Content))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs,removeNumbers)
reuters$Content <- sapply(docs, function(x){x$content})
## function that cluster the data with matrix and params
cluster <- function(kernelMatrix, k, output){
set.seed(20)
sc <- kkmeans(kernelMatrix, k, nstart = 10)
if(output){
vec <-vector(length = N)
for(x in 1:N){ vec[x] = sc[x] }
return( list( table( vec, reuters$Topic ), withinss(sc), sum( withinss(sc) ) ) )
}
if(!output){
return (sum (withinss(sc)) )
}
}
## shortest text length in the sample
maxLength <- function( TextSet ){
min( sapply( TextSet, function(x){ nchar(x) } ) )
}
## try a possible string length
bestLength <- function(kernelName, k, dataSet){
max <- 15#maxLength(dataSet)
scores <- sapply( seq(2,max), function(x) { param <- stringdot(kernelName, length = x)
kernelMatrix <- kernelMatrix(param,dataSet)
possibleError <- tryCatch(
sc <- cluster(kernelMatrix, k, FALSE),
error=function(e) return(NA)
)
if(!inherits(possibleError, "error")){
return (sc)
}
})
return( which.min(scores) + 1 )
}
writeInFile<- function(text,R){
suppressWarnings(write(text,file = 'ResultsClustering.txt', append=TRUE))
suppressWarnings(write.table( R[[1]],file = 'ResultsClustering.txt', append=TRUE ,sep = ","))
suppressWarnings(write(R[[2]],file = 'ResultsClustering.txt', append=TRUE))
suppressWarnings(write(R[[3]],file = 'ResultsClustering.txt', append=TRUE))
}
suppressWarnings(write( paste("Experiment with ",as.character(N)," texts"),file = 'ResultsClustering.txt', append=TRUE))
suppressWarnings(write.table(table(reuters$Topic),file = 'ResultsClustering.txt', append=TRUE ,sep = ","))
clusters = 3
##Calculate the clusters using spectrum kernel
n <- bestLength("spectrum",clusters,reuters$Content)
k <- stringdot( "spectrum", length = n )
K <- kernelMatrix(k, reuters$Content)
R <- cluster(K, clusters, TRUE)
writeInFile("SPECTRUM KERNEL", R)
suppressWarnings(write( as.character(n),file = 'ResultsClustering.txt', append=TRUE))
##Calculate the clusteres using the constant kernel
k <- stringdot("constant",length = 2)
K <- kernelMatrix(k, reuters$Content)
R <- cluster(K, clusters, TRUE)
writeInFile("CONSTANT KERNEL",R)
suppressWarnings(write( as.character(n),file = 'ResultsClustering.txt', append=TRUE))
##Calculate the clusters using the bound range kernel
n = bestLength("boundrange", clusters, reuters$Content)
k <- stringdot("boundrange",length=n)
K <- kernelMatrix(k, reuters$Content)
R <- cluster(K, clusters, TRUE)
writeInFile("BOUNDRANGE KERNEL",R)
suppressWarnings(write( as.character(n),file = 'ResultsClustering.txt', append=TRUE))
##Calculate the cluster using the exponential kernel
##Define lambda range
lambdas = seq(1.2,2,0.2)
optimizeExponential <-function(data, k, lambdas){
max <- 10#maxLength(data)
Scores <- matrix(NA, length(lambdas), max-1)
for(i in 2:max){
for(j in 1:length(lambdas)){
param <- stringdot(length = i, lambda = lambdas[j], type="exponential" )
matrix <- kernelMatrix( param, data)
possibleError <- tryCatch(
sc <- kkmeans(matrix, k, nstart = 10),
error=function(e) print("Param combination does not work")
)
if(!inherits(possibleError, "error")){
Scores[j,i-1] = sum(withinss(sc))
}
}
}
which(Scores == min(Scores, na.rm = TRUE), arr.ind = TRUE)
}
m <- optimizeExponential(reuters$Content, clusters, lambdas )
k <- stringdot(length = m[2]+1, lambda = lambdas[m[1]],type = "exponential" )
K <- kernelMatrix(k, reuters$Content)
R<- cluster(K, clusters, TRUE)
writeInFile("EXPONENTIAL KERNEL",R)
suppressWarnings(write( paste(as.character(m[2]+1),as.character(lambdas[m[1]])," "),file = 'ResultsClustering.txt', append=TRUE))
##for gap Kernel
optimizeGAP <- function(data, k, lambdas, begin, limit){
print("enter gap")
Scores <- matrix(NA, length(lambdas), limit)
for(i in begin:limit){
for(j in 1:length(lambdas)){
param <- makeGapKernel( lambdas[j], i)
matrix <- kernelMatrix( param, data)
possibleError <- tryCatch({
sc <- kkmeans(matrix, k, nstart = 10)
Scores[j,i-1] = sum(withinss(sc))},
error=function(e) print("param combination does not work")
)
}
}
which(Scores == min(Scores, na.rm = TRUE), arr.ind = TRUE)
}
##Launch the gap kernel
lambdas = seq(0.1,0.9,0.1)
fc = optimizeGAP(reuters$Content,clusters,lambdas,6,8)
k <- makeGapKernel(lambdas[fc[1]], fc[2]+1)
K <- kernelMatrix(k, reuters$Content)
writeInFile("GAP KERNEL",R)
suppressWarnings(write( paste(as.character(fc[2]+1),as.character(lambdas[fc[1]])," "),file = 'ResultsClustering.txt', append=TRUE))
##Clustering using the connect kernel
k <- new("kernel", .Data=connect, kpar=list())
K <- kernelMatrix(k ,reuters$Content)
R <- cluster(K, clusters, TRUE)
writeInFile("CONNECT",R)