This document contains my proposed solutions to the test described at https://github.com/rstats-gsoc/gsoc2019/wiki/Parallel-Coordinate-Plots-in-ggplot2
Loading the packages
library("GGally")
## Loading required package: ggplot2
library("ggparallel")
# Abalone data (comma-separated); the columns get the default names V1, V2, ...
# data source: http://archive.ics.uci.edu/ml/datasets/Abalone
df <- read.table(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",
  sep = ","
)

# V1 is the grouping variable, so it has to be a factor.
df[["V1"]] <- as.factor(df[["V1"]])

# Parallel coordinate plot of every column except the first, coloured by V1.
ggparcoord(
  data = df,
  columns = 2:ncol(df),
  groupColumn = 1,
  order = "anyClass",
  showPoints = TRUE,
  title = "Parallel Coordinate Plot for the Abalone Data",
  alphaLines = 0.3
)
We can clearly see that the group V1 = I differs from the other groups in all variables.
# Breast cancer data (comma-separated).
# data source: http://archive.ics.uci.edu/ml/datasets/Breast+Cancer
# `header` is spelled out in full and uses TRUE: `head=T` relied on partial
# argument matching and on `T`, which can be reassigned.
# NOTE(review): column 2 has to be renamed by hand below, which suggests the
# first line of the file may be a data row rather than a header - confirm; if
# so, read with header = FALSE and supply col.names instead.
bc <- read.table(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data",
  sep = ",",
  header = TRUE
)
names(bc)[2] <- "age"

# Columns 1 to 3 are plotted, and column 1 is repeated as the last axis.
ggparallel(names(bc)[c(1:3, 1)], order = 0, bc) +
  scale_fill_brewer(palette = "Paired", guide = "none") +
  scale_colour_brewer(palette = "Paired", guide = "none")
## Warning: attributes are not identical across measure variables; they will
## be dropped
We can see that recurrent events in breast cancer are less common at younger ages; however, the remaining variables are widely dispersed.
library(tidyr)
# Build `n` colours equally spaced around the HCL colour wheel
# (luminance 65, chroma 100).
#
# n: number of colours wanted (a non-negative whole number).
# Returns a character vector of `n` hex colour strings; `character(0)` when
# `n` is 0.
gg_color_hue <- function(n) {
  # n + 1 hues are generated because the last one (375 = 15 + 360) lands on
  # the same hue as the first; it is dropped by the subsetting below.
  hues <- seq(15, 375, length.out = n + 1)
  # seq_len(n) instead of 1:n, so that n = 0 yields no colour at all
  # (1:0 is c(1, 0), which would return one colour).
  hcl(h = hues, l = 65, c = 100)[seq_len(n)]
}
# Hand-made parallel coordinate plot of the abalone data with plain ggplot2.

# Standardise every column except the grouping column V1.
df2 <- df
df2[, -1] <- apply(df2[, -1], 2, scale)

# Long format: one row per (observation, variable) pair.
df2 <- gather(df2, variable, value, -V1)

# The factor levels are first built in sorted order and then rearranged by
# hand so that the axes come in the same order as in the previous plot.
df2$variable <- factor(df2$variable)
df2$variable <- factor(
  df2$variable,
  levels = levels(df2$variable)[c(2, 4, 6, 1, 7, 5, 3, 8)]
)

# Observation id, repeated once per variable; it ties the points of one
# observation together into a single line.
df2$id <- rep_len(seq_len(nrow(df)), nrow(df2))

# One colour per level of V1, used to colour the points like the lines.
pal <- gg_color_hue(nlevels(df2$V1))

ggplot(df2) +
  geom_line(aes(x = variable, y = value, group = id, color = V1)) +
  geom_point(aes(x = variable, y = value), color = pal[as.integer(df2$V1)])
# Draw a parallel coordinate plot of `data` with ggplot2.
#
# Arguments:
#   data       data frame with one grouping column; all other columns must
#              be numeric.
#   groupVble  name of the grouping column (a single string, default "V1").
#
# Each numeric column is standardised with scale() and becomes one position
# on the x axis. Every row of `data` is drawn as one line, coloured by its
# group.
#
# Returns the ggplot object.
f.parallelCoor <- function(data, groupVble = "V1") {
  stopifnot(
    is.data.frame(data),
    is.character(groupVble),
    length(groupVble) == 1,
    groupVble %in% names(data)
  )

  measure_cols <- setdiff(names(data), groupVble)
  if (length(measure_cols) == 0) {
    stop("`data` has no columns besides the grouping column", call. = FALSE)
  }
  is_num <- vapply(data[measure_cols], is.numeric, logical(1))
  if (!all(is_num)) {
    stop(
      "non-numeric column(s): ",
      paste(measure_cols[!is_num], collapse = ", "),
      call. = FALSE
    )
  }

  # The grouping column is looked up by name. Its position changes once the
  # data are reshaped to long format, so a positional index picks the wrong
  # column whenever the group is not the first column.
  grp <- data[[groupVble]]
  if (!is.factor(grp)) {
    grp <- factor(grp)
  }

  n_obs <- nrow(data)
  n_vars <- length(measure_cols)

  # Standardise column by column (scale() returns a one-column matrix).
  scaled <- lapply(data[measure_cols], function(col) as.numeric(scale(col)))

  # Long format, stacked variable by variable; `id` identifies the original
  # row so that its points can be joined into one line.
  long <- data.frame(
    grp = rep(grp, times = n_vars),
    variable = rep(measure_cols, each = n_obs),
    value = unlist(scaled, use.names = FALSE),
    id = rep(seq_len(n_obs), times = n_vars),
    stringsAsFactors = FALSE
  )

  # Colour is mapped once for both layers, so points and lines always share
  # the same colour scale (also for ordered factors). The points are kept out
  # of the legend, which shows the lines only.
  ggplot(long, aes(x = variable, y = value, color = grp)) +
    geom_line(aes(group = id)) +
    geom_point(show.legend = FALSE) +
    labs(color = groupVble)
}
# Abalone data, grouped by V1.
f.parallelCoor(df, groupVble = "V1")

# Random sample of 100 rows of `diamonds`; columns 1, 2 and 5 to 10 are
# kept, and the lines are grouped by `cut`.
diamonds.samp <- diamonds[sample.int(nrow(diamonds), 100), ]
f.parallelCoor(diamonds.samp[c(1, 2, 5:10)], groupVble = "cut")
library (plyr)
# Draw a parallel coordinate plot of `data` with ggplot2, optionally
# jittering the values.
#
# Arguments:
#   data       data frame with one grouping column; all other columns must
#              be numeric.
#   groupVble  name of the grouping column (a single string, default "V1").
#   jit        if TRUE, random noise is added to the standardised values with
#              jitter(), separately for each variable.
#   jitFactor  `factor` argument passed to jitter(); the default is large so
#              that the jitter is visible.
#
# Each numeric column is standardised with scale() and becomes one position
# on the x axis. Every row of `data` is drawn as one line, coloured by its
# group.
#
# Returns the ggplot object.
f.parallelCoor <- function(data, groupVble = "V1", jit = FALSE, jitFactor = 10) {
  stopifnot(
    is.data.frame(data),
    is.character(groupVble),
    length(groupVble) == 1,
    groupVble %in% names(data)
  )

  measure_cols <- setdiff(names(data), groupVble)
  if (length(measure_cols) == 0) {
    stop("`data` has no columns besides the grouping column", call. = FALSE)
  }
  is_num <- vapply(data[measure_cols], is.numeric, logical(1))
  if (!all(is_num)) {
    stop(
      "non-numeric column(s): ",
      paste(measure_cols[!is_num], collapse = ", "),
      call. = FALSE
    )
  }

  # The grouping column is looked up by name. Its position changes once the
  # data are reshaped to long format, so a positional index picks the wrong
  # column whenever the group is not the first column.
  grp <- data[[groupVble]]
  if (!is.factor(grp)) {
    grp <- factor(grp)
  }

  n_obs <- nrow(data)
  n_vars <- length(measure_cols)

  # Standardise column by column and, if requested, jitter each variable on
  # its own. Jittering in place keeps every value on its original row.
  scaled <- lapply(data[measure_cols], function(col) {
    z <- as.numeric(scale(col))
    if (jit) {
      z <- jitter(z, factor = jitFactor)
    }
    z
  })

  # Long format, stacked variable by variable; `id` identifies the original
  # row so that its points can be joined into one line.
  long <- data.frame(
    grp = rep(grp, times = n_vars),
    variable = rep(measure_cols, each = n_obs),
    value = unlist(scaled, use.names = FALSE),
    id = rep(seq_len(n_obs), times = n_vars),
    stringsAsFactors = FALSE
  )

  # Colour is mapped once for both layers, so points and lines always share
  # the same colour scale (also for ordered factors). The points are kept out
  # of the legend, which shows the lines only.
  ggplot(long, aes(x = variable, y = value, color = grp)) +
    geom_line(aes(group = id)) +
    geom_point(show.legend = FALSE) +
    labs(color = groupVble)
}
# Abalone data, grouped by V1, with jitter switched on.
f.parallelCoor(df, groupVble = "V1", jit = TRUE)

# A fresh random sample of 100 rows of `diamonds` (columns 1, 2 and 5 to
# 10), grouped by `cut`, with jitter switched on.
diamonds.samp <- diamonds[sample.int(nrow(diamonds), 100), ]
f.parallelCoor(diamonds.samp[c(1, 2, 5:10)], groupVble = "cut", jit = TRUE)
As part of the hard solution, I created the pcplots package. See it here: https://github.com/auroragonzalez/pcplots