Chapter 4 Unsupervised or Descriptive Modeling
From the descriptive (unsupervised) point of view, patterns are found that describe the data rather than predict future behaviour or estimate a value. This includes association rules and clustering, e.g., tree (hierarchical) clustering, which aims at grouping objects (e.g., animals) into successively larger clusters using some measure of similarity or distance. The dataset is the same as the previous table but without the class attribute \(C\):
| Att1 | … | Attn |
|---|---|---|
| a11 | … | a1n |
| a21 | … | a2n |
| … | … | … |
| am1 | … | amn |
4.1 Clustering
# foreign provides read.arff(), fpc provides dbscan()
library(foreign)
library(fpc)
# Read the KC1 defect dataset (ARFF format)
kc1 <- read.arff("./datasets/defectPred/D1/KC1.arff")
# Split into training and test datasets
set.seed(1)
ind <- sample(2, nrow(kc1), replace = TRUE, prob = c(0.7, 0.3))
kc1.train <- kc1[ind==1, ]
kc1.test <- kc1[ind==2, ]
# Remove the class attribute: clustering is unsupervised
kc1.train$Defective <- NULL
# Density-based clustering (DBSCAN): eps is the neighbourhood radius,
# MinPts the minimum number of points needed to form a dense region
ds <- dbscan(kc1.train, eps = 0.42, MinPts = 5)
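The tree clustering mentioned at the beginning of the chapter can be sketched with base R's hclust; the Euclidean distance, Ward linkage and the cut into two groups below are illustrative choices, not part of the original analysis.
# Hierarchical (tree) clustering: objects are merged into successively
# larger clusters according to a distance measure
d <- dist(scale(kc1.train))            # Euclidean distance on scaled metrics
kc1.hclust <- hclust(d, method = "ward.D2")
plot(kc1.hclust, labels = FALSE)       # dendrogram of the merging process
groups <- cutree(kc1.hclust, k = 2)    # cut the tree into two clusters
table(groups)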
# k-means clustering with k = 2
kc1.kmeans <- kmeans(kc1.train, 2)
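A possible follow-up (not in the original code) is to look at the cluster sizes and cross-tabulate the k-means assignments against the Defective labels that were held out of the clustering:
kc1.kmeans$size                                        # modules per cluster
table(kc1.kmeans$cluster, kc1[ind == 1, "Defective"])  # clusters vs. held-out class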
4.2 Association rules
library(arules)
# Exploratory look at a single metric before discretisation (optional):
# x <- as.numeric(kc1$LOC_TOTAL)
# str(x)
# summary(x)
# hist(x, breaks=30, main="LoC Total")
# xDisc <- discretize(x, method="interval", breaks=5)
# table(xDisc)
# Apriori needs nominal attributes: discretise the numeric metrics
# (columns 1 to 21) into 5 equal-width intervals
for(i in 1:21) kc1[,i] <- discretize(kc1[,i], method = "interval", breaks = 5)
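As an optional check (assuming LOC_TOTAL is one of the discretised metrics), the resulting interval levels can be tabulated:
table(kc1$LOC_TOTAL)   # five equal-width interval levels after discretisation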
# Mine rules with at least 3 items, minimum support 0.05 and minimum
# confidence 0.35, keeping only rules that conclude Defective=Y
rules <- apriori(kc1,
                 parameter = list(minlen = 3, supp = 0.05, conf = 0.35),
                 appearance = list(rhs = c("Defective=Y"), default = "lhs"),
                 control = list(verbose = FALSE))
# Alternative call allowing both Defective=Y and Defective=N as consequents:
#rules <- apriori(kc1,
#                 parameter = list(minlen=2, supp=0.05, conf=0.3),
#                 appearance = list(rhs=c("Defective=Y", "Defective=N"),
#                                   default="lhs"))
inspect(rules)
##     lhs                                  rhs             support confidence coverage lift count
## [1] {HALSTEAD_CONTENT=[38.6,77.2),
##      HALSTEAD_LEVEL=[0,0.4)}          => {Defective=Y}    0.0539      0.370    0.146 2.39   113
## [2] {LOC_CODE_AND_COMMENT=[0,2.4),
##      HALSTEAD_CONTENT=[38.6,77.2)}    => {Defective=Y}    0.0525      0.377    0.139 2.43   110
## [3] {LOC_CODE_AND_COMMENT=[0,2.4),
##      HALSTEAD_CONTENT=[38.6,77.2),
##      HALSTEAD_LEVEL=[0,0.4)}          => {Defective=Y}    0.0515      0.374    0.138 2.41   108
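Rules can also be ranked before inspection, for instance by lift, using arules' sort and head:
rules.sorted <- sort(rules, by = "lift")   # order rules by decreasing lift
inspect(head(rules.sorted, n = 3))         # show the three rules with highest lift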
library(arulesViz)
# Default scatter plot: support vs. confidence, shaded by lift
plot(rules)
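Other arulesViz representations can be tried as well; the grouped matrix and graph methods below are illustrative options, not part of the original analysis:
plot(rules, method = "grouped")   # grouped matrix of antecedents and consequents
plot(rules, method = "graph")     # rules as a graph connecting items and rules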