Chapter 4 Unsupervised or Descriptive modeling

From the descriptive (unsupervised) point of view, patterns are found in order to predict or estimate future behaviour. Techniques include association rules, clustering, and tree clustering, the last of which aims at grouping together objects (e.g., animals) into successively larger clusters, using some measure of similarity or distance. The dataset has the same form as the previous table, but without the \(C\) class attribute:

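| \(Att_1\) | \(\dots\) | \(Att_n\) |
|---|---|---|
| \(a_{11}\) | \(\dots\) | \(a_{1n}\) |
| \(a_{21}\) | \(\dots\) | \(a_{2n}\) |
| \(\dots\) | \(\dots\) | \(\dots\) |
| \(a_{m1}\) | \(\dots\) | \(a_{mn}\) |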

4.1 Clustering

library(foreign)
library(fpc)

# Load the KC1 dataset from its ARFF file.
kc1 <- foreign::read.arff("./datasets/defectPred/D1/KC1.arff")

# Randomly label every row 1 (training, probability 0.7) or 2 (test,
# probability 0.3). The fixed seed makes the partition reproducible.
set.seed(1)
ind <- sample(2, nrow(kc1), replace = TRUE, prob = c(0.7, 0.3))
kc1.test <- kc1[ind == 2, ]
kc1.train <- kc1[ind == 1, ]

# Clustering is unsupervised: drop the class attribute from the training data.
kc1.train[["Defective"]] <- NULL

# Density-based clustering (fpc implementation).
ds <- fpc::dbscan(kc1.train, eps = 0.42, MinPts = 5)

# k-means with two clusters on the same training data.
kc1.kmeans <- kmeans(kc1.train, centers = 2)

4.1.1 k-Means

library(reshape, quietly=TRUE)
library(graphics)
# Drop incomplete rows, rescale every remaining column with reshape's
# rescaler(type = "range"), and run k-means with 10 clusters on the result.
kc1kmeans <- kmeans(
  sapply(
    na.omit(kc1.train),
    function(column) rescaler(column, type = "range")
  ),
  centers = 10
)
# Plotting is left disabled:
#plot(kc1kmeans, col = kc1kmeans$cluster)
#points(kc1kmeans$centers, col = 1:5, pch = 8)

4.2 Association rules

library(arules)

# x <- as.numeric(kc1$LOC_TOTAL)
# str(x)
# summary(x)
# hist(x, breaks=30, main="LoC Total")
# xDisc <- discretize(x, categories=5)
# table(xDisc)

# Discretize every numeric attribute into 5 equal-width intervals so that
# apriori() receives categorical columns. Columns are selected by type rather
# than by a hard-coded index range (previously 1:21), so the code does not
# depend on the exact number or order of attributes; the class attribute
# "Defective" is always left untouched.
is_num <- vapply(kc1, is.numeric, logical(1)) & names(kc1) != "Defective"
kc1[is_num] <- lapply(
  kc1[is_num],
  discretize,
  method = "interval",
  breaks = 5
)

# Mine rules whose consequent is restricted to "Defective=Y"; all other items
# may only appear in the antecedent. Thresholds: at least 3 items per rule,
# minimum support 0.05, minimum confidence 0.35.
rules <- apriori(
  kc1,
  parameter = list(minlen = 3, supp = 0.05, conf = 0.35),
  appearance = list(rhs = c("Defective=Y"), default = "lhs"),
  control = list(verbose = FALSE)
)

# Alternative (disabled): allow both class values as consequent.
#rules <- apriori(kc1,
 #   parameter = list(minlen=2, supp=0.05, conf=0.3),
 #   appearance = list(rhs=c("Defective=Y", "Defective=N"),
 #   default="lhs"))

# Print the mined rules together with their quality measures.
inspect(rules)
##     lhs                                rhs           support confidence coverage lift count
## [1] {HALSTEAD_CONTENT=[38.6,77.2),                                                         
##      HALSTEAD_LEVEL=[0,0.4)}        => {Defective=Y}  0.0539      0.370    0.146 2.39   113
## [2] {LOC_CODE_AND_COMMENT=[0,2.4),                                                         
##      HALSTEAD_CONTENT=[38.6,77.2)}  => {Defective=Y}  0.0525      0.377    0.139 2.43   110
## [3] {LOC_CODE_AND_COMMENT=[0,2.4),                                                         
##      HALSTEAD_CONTENT=[38.6,77.2),                                                         
##      HALSTEAD_LEVEL=[0,0.4)}        => {Defective=Y}  0.0515      0.374    0.138 2.41   108
# Visualise the mined rules. arulesViz is attached immediately before the
# call; presumably it supplies the plot method used for `rules` here.
library(arulesViz)
plot(rules)