# HW6: Extra Exercise
# CS816 Big Data Analytics
# Answered by: Aziz


#################
# Extra Exercise
#################

library('arules')
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## 
## The following objects are masked from 'package:base':
## 
##     %in%, abbreviate, write
library('arulesViz')
## Loading required package: grid
## 
## Attaching package: 'arulesViz'
## 
## The following object is masked from 'package:arules':
## 
##     abbreviate
## 
## The following object is masked from 'package:base':
## 
##     abbreviate
## create the dataset file using basket format
# Each element is one transaction; items within a transaction are separated
# by commas (the separator later handed to read.transactions()).
# NOTE(review): "potato,chips" and "hamburger,meat" are split into two items
# each by the comma separator — presumably "potato chips" / "hamburger meat"
# were meant as single items; confirm against the assignment.
purchases <- c("beer,diapers",
               "soda,potato,chips,hamburger,meat,milk,eggs",
               "coffee,eggs",
               "beer,bread,cheese,ham",
               "diapers,beer,potato,chips",
               "cheese,ham,beer",
               "ham,cheese,bread,coffee,milk",
               "soda,cheese,bread,ham",
               "coffee,hamburger,meat",
               "eggs,diapers,beer")

# write to a basket file, one transaction per line.
# The vector is written directly: paste(x, sep = "\n") on a single vector is
# a no-op (sep only joins across arguments), and writeLines() already ends
# every element with a newline. This also avoids creating a variable named
# `data`, which would shadow base::data().
writeLines(purchases, con = "purchases")

# Read the transactions back from the "purchases" basket file
# (one transaction per line, items separated by commas) and list them.
trans <- read.transactions(
  file = "purchases",
  format = "basket",
  sep = ","
)
inspect(trans)
##    items                                       
## 1  {beer,diapers}                              
## 2  {chips,eggs,hamburger,meat,milk,potato,soda}
## 3  {coffee,eggs}                               
## 4  {beer,bread,cheese,ham}                     
## 5  {beer,chips,diapers,potato}                 
## 6  {beer,cheese,ham}                           
## 7  {bread,cheese,coffee,ham,milk}              
## 8  {bread,cheese,ham,soda}                     
## 9  {coffee,hamburger,meat}                     
## 10 {beer,diapers,eggs}
# Summarise the transaction set: dimensions, density, most frequent items
# and the distribution of transaction sizes.
summary(trans)
## transactions as itemMatrix in sparse format with
##  10 rows (elements/itemsets/transactions) and
##  13 columns (items) and a density of 0.2846154 
## 
## most frequent items:
##    beer  cheese     ham   bread  coffee (Other) 
##       5       4       4       3       3      18 
## 
## element (itemset/transaction) length distribution:
## sizes
## 2 3 4 5 7 
## 2 3 3 1 1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0     3.0     3.5     3.7     4.0     7.0 
## 
## includes extended item information - examples:
##   labels
## 1   beer
## 2  bread
## 3 cheese
# apply apriori on the itemsets in the transactions

# Association rules of length 2 (lhs + rhs) with support >= 0.3.
# apriori() is left on its default target, so this mines rules (subject to
# the default minimum confidence) rather than bare frequent itemsets.
# NOTE(review): this line originally carried a trailing "# 0.03", presumably
# an alternative support threshold that was tried — confirm.
items2 <- apriori(
  trans,
  parameter = list(support = 0.3, minlen = 2, maxlen = 2)
)
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##         0.8    0.1    1 none FALSE            TRUE     0.3      2      2
##  target   ext
##   rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09)        (c) 1996-2004   Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[13 item(s), 10 transaction(s)] done [0.00s].
## sorting and recoding items ... [7 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [5 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Summarise the length-2 rule set (rule count, length distribution and
# support / confidence / lift statistics).
summary(items2)
## set of 5 rules
## 
## rule length distribution (lhs + rhs):sizes
## 2 
## 5 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2       2       2       2       2       2 
## 
## summary of quality measures:
##     support       confidence      lift    
##  Min.   :0.30   Min.   :1    Min.   :2.0  
##  1st Qu.:0.30   1st Qu.:1    1st Qu.:2.5  
##  Median :0.30   Median :1    Median :2.5  
##  Mean   :0.34   Mean   :1    Mean   :2.4  
##  3rd Qu.:0.40   3rd Qu.:1    3rd Qu.:2.5  
##  Max.   :0.40   Max.   :1    Max.   :2.5  
## 
## mining info:
##   data ntransactions support confidence
##  trans            10     0.3        0.8
# List the length-2 rules ordered by support, highest first.
inspect(
  sort(items2, by = "support")
)
##   lhs          rhs      support confidence lift
## 4 {cheese}  => {ham}    0.4     1          2.5 
## 5 {ham}     => {cheese} 0.4     1          2.5 
## 1 {diapers} => {beer}   0.3     1          2.0 
## 2 {bread}   => {cheese} 0.3     1          2.5 
## 3 {bread}   => {ham}    0.3     1          2.5
# Association rules of length 3 (lhs + rhs) with support >= 0.3.
# As with the other apriori() calls here, the default target is used, so the
# result is a set of rules, not bare frequent itemsets.
items3 <- apriori(
  trans,
  parameter = list(support = 0.3, minlen = 3, maxlen = 3)
)
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##         0.8    0.1    1 none FALSE            TRUE     0.3      3      3
##  target   ext
##   rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09)        (c) 1996-2004   Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[13 item(s), 10 transaction(s)] done [0.00s].
## sorting and recoding items ... [7 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [2 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Summarise the length-3 rule set (rule count, length distribution and
# support / confidence / lift statistics).
summary(items3)
## set of 2 rules
## 
## rule length distribution (lhs + rhs):sizes
## 3 
## 2 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       3       3       3       3       3       3 
## 
## summary of quality measures:
##     support      confidence      lift    
##  Min.   :0.3   Min.   :1    Min.   :2.5  
##  1st Qu.:0.3   1st Qu.:1    1st Qu.:2.5  
##  Median :0.3   Median :1    Median :2.5  
##  Mean   :0.3   Mean   :1    Mean   :2.5  
##  3rd Qu.:0.3   3rd Qu.:1    3rd Qu.:2.5  
##  Max.   :0.3   Max.   :1    Max.   :2.5  
## 
## mining info:
##   data ntransactions support confidence
##  trans            10     0.3        0.8
# List the length-3 rules ordered by support.
inspect(
  sort(items3, by = "support")
)
##   lhs               rhs      support confidence lift
## 1 {bread,cheese} => {ham}    0.3     1          2.5 
## 2 {bread,ham}    => {cheese} 0.3     1          2.5
# Association rules of length 4 (lhs + rhs) with support >= 0.3.
# The default target is used, so this asks for rules, not bare itemsets;
# the recorded run finds none at this support level.
items4 <- apriori(
  trans,
  parameter = list(support = 0.3, minlen = 4, maxlen = 4)
)
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##         0.8    0.1    1 none FALSE            TRUE     0.3      4      4
##  target   ext
##   rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09)        (c) 1996-2004   Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[13 item(s), 10 transaction(s)] done [0.00s].
## sorting and recoding items ... [7 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Summarise the length-4 result; the recorded run reports an empty rule set.
summary(items4)
## set of 0 rules
##############################
# Generate and Visualize Rules
##############################

# Mine rules without an explicit maxlen, so rule length is capped only by
# the package default (shown as 10 in the recorded parameter printout).
rules <- apriori(
  trans,
  parameter = list(support = 0.3, minlen = 1)
)
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##         0.8    0.1    1 none FALSE            TRUE     0.3      1     10
##  target   ext
##   rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09)        (c) 1996-2004   Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[13 item(s), 10 transaction(s)] done [0.00s].
## sorting and recoding items ... [7 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [7 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Summarise the rule set mined without a maxlen restriction.
summary(rules)
## set of 7 rules
## 
## rule length distribution (lhs + rhs):sizes
## 2 3 
## 5 2 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   2.000   2.286   2.500   3.000 
## 
## summary of quality measures:
##     support         confidence      lift      
##  Min.   :0.3000   Min.   :1    Min.   :2.000  
##  1st Qu.:0.3000   1st Qu.:1    1st Qu.:2.500  
##  Median :0.3000   Median :1    Median :2.500  
##  Mean   :0.3286   Mean   :1    Mean   :2.429  
##  3rd Qu.:0.3500   3rd Qu.:1    3rd Qu.:2.500  
##  Max.   :0.4000   Max.   :1    Max.   :2.500  
## 
## mining info:
##   data ntransactions support confidence
##  trans            10     0.3        0.8
# Print every mined rule with its support, confidence and lift.
inspect(rules)
##   lhs               rhs      support confidence lift
## 1 {diapers}      => {beer}   0.3     1          2.0 
## 2 {bread}        => {cheese} 0.3     1          2.5 
## 3 {bread}        => {ham}    0.3     1          2.5 
## 4 {cheese}       => {ham}    0.4     1          2.5 
## 5 {ham}          => {cheese} 0.4     1          2.5 
## 6 {bread,cheese} => {ham}    0.3     1          2.5 
## 7 {bread,ham}    => {cheese} 0.3     1          2.5
# Rules used for the visualizations: re-mine with the minimum confidence
# lowered to 0.3 (and minlen = 2). This overwrites the previous `rules`.
rules <- apriori(
  trans,
  parameter = list(
    target = "rules",
    support = 0.3,
    confidence = 0.3,
    minlen = 2
  )
)
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##         0.3    0.1    1 none FALSE            TRUE     0.3      2     10
##  target   ext
##   rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09)        (c) 1996-2004   Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[13 item(s), 10 transaction(s)] done [0.00s].
## sorting and recoding items ... [7 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [11 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Summarise the rule set mined with the lowered confidence threshold.
summary(rules)
## set of 11 rules
## 
## rule length distribution (lhs + rhs):sizes
## 2 3 
## 8 3 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   2.000   2.273   2.500   3.000 
## 
## summary of quality measures:
##     support         confidence          lift      
##  Min.   :0.3000   Min.   :0.6000   Min.   :2.000  
##  1st Qu.:0.3000   1st Qu.:0.7500   1st Qu.:2.500  
##  Median :0.3000   Median :1.0000   Median :2.500  
##  Mean   :0.3182   Mean   :0.8955   Mean   :2.409  
##  3rd Qu.:0.3000   3rd Qu.:1.0000   3rd Qu.:2.500  
##  Max.   :0.4000   Max.   :1.0000   Max.   :2.500  
## 
## mining info:
##   data ntransactions support confidence
##  trans            10     0.3        0.3
# Print every rule from the lowered-confidence run with its quality measures.
inspect(rules)
##    lhs               rhs       support confidence lift
## 1  {diapers}      => {beer}    0.3     1.00       2.0 
## 2  {beer}         => {diapers} 0.3     0.60       2.0 
## 3  {bread}        => {cheese}  0.3     1.00       2.5 
## 4  {cheese}       => {bread}   0.3     0.75       2.5 
## 5  {bread}        => {ham}     0.3     1.00       2.5 
## 6  {ham}          => {bread}   0.3     0.75       2.5 
## 7  {cheese}       => {ham}     0.4     1.00       2.5 
## 8  {ham}          => {cheese}  0.4     1.00       2.5 
## 9  {bread,cheese} => {ham}     0.3     1.00       2.5 
## 10 {bread,ham}    => {cheese}  0.3     1.00       2.5 
## 11 {cheese,ham}   => {bread}   0.3     0.75       2.5
# Default arulesViz plot of the rule set.
plot(rules)

# Plot the data frame of quality measures (one column per measure).
# The quality() accessor is used instead of reaching into the S4 slot with
# `@quality`, matching how the filter just below reads the same data.
plot(quality(rules))

# Keep only the rules whose confidence is strictly above 0.3.
# For the recorded run this retains all 11 rules: they were mined with a
# minimum confidence of 0.3 and the lowest observed confidence is 0.6.
confidentRules <- rules[quality(rules)$confidence > 0.3]
inspect(confidentRules)
##    lhs               rhs       support confidence lift
## 1  {diapers}      => {beer}    0.3     1.00       2.0 
## 2  {beer}         => {diapers} 0.3     0.60       2.0 
## 3  {bread}        => {cheese}  0.3     1.00       2.5 
## 4  {cheese}       => {bread}   0.3     0.75       2.5 
## 5  {bread}        => {ham}     0.3     1.00       2.5 
## 6  {ham}          => {bread}   0.3     0.75       2.5 
## 7  {cheese}       => {ham}     0.4     1.00       2.5 
## 8  {ham}          => {cheese}  0.4     1.00       2.5 
## 9  {bread,cheese} => {ham}     0.3     1.00       2.5 
## 10 {bread,ham}    => {cheese}  0.3     1.00       2.5 
## 11 {cheese,ham}   => {bread}   0.3     0.75       2.5
# Matrix-style plot of the filtered rules; reorder = TRUE is handed to the
# plot through its control list.
plot(
  confidentRules,
  method = "matrix",
  control = list(reorder = TRUE)
)
## Itemsets in Antecedent (LHS)
## [1] "{cheese}"       "{cheese,ham}"   "{ham}"          "{bread,ham}"   
## [5] "{bread}"        "{bread,cheese}" "{diapers}"      "{beer}"        
## Itemsets in Consequent (RHS)
## [1] "{cheese}"  "{beer}"    "{diapers}" "{bread}"   "{ham}"

# Show the first ten rules after sorting by lift (highest lift first).
inspect(
  head(sort(rules, by = "lift"), n = 10)
)
##    lhs               rhs      support confidence lift
## 3  {bread}        => {cheese} 0.3     1.00       2.5 
## 4  {cheese}       => {bread}  0.3     0.75       2.5 
## 5  {bread}        => {ham}    0.3     1.00       2.5 
## 6  {ham}          => {bread}  0.3     0.75       2.5 
## 7  {cheese}       => {ham}    0.4     1.00       2.5 
## 8  {ham}          => {cheese} 0.4     1.00       2.5 
## 9  {bread,cheese} => {ham}    0.3     1.00       2.5 
## 10 {bread,ham}    => {cheese} 0.3     1.00       2.5 
## 11 {cheese,ham}   => {bread}  0.3     0.75       2.5 
## 1  {diapers}      => {beer}   0.3     1.00       2.0
# Take the five rules that come first when sorting by confidence and draw
# them as a graph (control option type = "items").
highConfidenceRules <- head(sort(rules, by = "confidence"), n = 5)
plot(
  highConfidenceRules,
  method = "graph",
  control = list(type = "items")
)

# Take the five rules that come first when sorting by lift and draw them as
# a graph (control option type = "items").
highLiftRules <- head(sort(rules, by = "lift"), n = 5)
plot(
  highLiftRules,
  method = "graph",
  control = list(type = "items")
)