Chapter 4 Unsupervised or Descriptive modeling
From the descriptive (unsupervised) point of view, patterns are found to predict or estimate future behaviour. This includes association rules, clustering, and tree clustering, which aims at grouping together objects (e.g., animals) into successively larger clusters, using some measure of similarity or distance. The dataset will be as in the previous table, but without the \(C\) class attribute:
Att1 | … | Attn |
---|---|---|
a11 | … | a1n |
a21 | … | a2n |
… | … | … |
am1 | … | amn |
4.1 Clustering
library(foreign)
library(fpc)
# Load the KC1 defect-prediction dataset (ARFF format) into a data frame.
kc1 <- read.arff("./datasets/defectPred/D1/KC1.arff")

# Split into training and test datasets
# Every row is randomly labelled 1 (prob. 0.7) or 2 (prob. 0.3); the fixed
# seed makes the split reproducible.
set.seed(1)
ind <- sample(2, nrow(kc1), replace = TRUE, prob = c(0.7, 0.3))
kc1.train <- kc1[ind == 1, ]
kc1.test <- kc1[ind == 2, ]

# No class
# Clustering is unsupervised, so the class attribute is removed from the
# training data before fitting.
kc1.train$Defective <- NULL

# Density-based clustering; with library(fpc) attached this resolves to
# fpc's dbscan (eps = neighbourhood radius, MinPts = minimum neighbours).
ds <- dbscan(kc1.train, eps = 0.42, MinPts = 5)

# k-means with two clusters on the same training data.
kc1.kmeans <- kmeans(kc1.train, 2)
4.2 Association rules
library(arules)
# x <- as.numeric(kc1$LOC_TOTAL)
# str(x)
# summary(x)
# hist(x, breaks=30, main="LoC Total")
# xDisc <- discretize(x, categories=5)
# table(xDisc)
# Discretise the first 21 columns into 5 equal-width intervals, since apriori
# works on categorical items.
# NOTE(review): assumes columns 1-21 are the numeric metrics and that
# Defective sits outside that range -- confirm against the dataset.
for (i in 1:21) {
  kc1[, i] <- discretize(kc1[, i], method = "interval", breaks = 5)
}

# Mine rules whose right-hand side is "Defective=Y"; every other item may only
# appear on the left-hand side. minlen = 3 keeps rules with at least three
# items in total (lhs + rhs).
rules <- apriori(kc1,
  parameter = list(minlen = 3, supp = 0.05, conf = 0.35),
  appearance = list(rhs = c("Defective=Y"),
                    default = "lhs"),
  control = list(verbose = FALSE))
#rules <- apriori(kc1,
# parameter = list(minlen=2, supp=0.05, conf=0.3),
# appearance = list(rhs=c("Defective=Y", "Defective=N"),
# default="lhs"))

# Print the mined rules together with their quality measures.
inspect(rules)
## lhs rhs support confidence coverage lift count
## [1] {HALSTEAD_CONTENT=[38.6,77.2),
## HALSTEAD_LEVEL=[0,0.4)} => {Defective=Y} 0.0539 0.370 0.146 2.39 113
## [2] {LOC_CODE_AND_COMMENT=[0,2.4),
## HALSTEAD_CONTENT=[38.6,77.2)} => {Defective=Y} 0.0525 0.377 0.139 2.43 110
## [3] {LOC_CODE_AND_COMMENT=[0,2.4),
## HALSTEAD_CONTENT=[38.6,77.2),
## HALSTEAD_LEVEL=[0,0.4)} => {Defective=Y} 0.0515 0.374 0.138 2.41 108
library(arulesViz)
# Visualise the mined rule set; arulesViz supplies the plot method for
# arules rule objects.
plot(rules)