This document contains my proposed solutions to the test set out at https://github.com/rstats-gsoc/gsoc2019/wiki/Parallel-Coordinate-Plots-in-ggplot2

Easy

Loading the packages

library("GGally")

## Loading required package: ggplot2

library("ggparallel")

Highlight the main purpose of ggparcoord: Abalone dataset with predictive purposes (predict the age of abalone from physical measurements)

# Abalone measurements, read as comma-separated values without a header row,
# so the columns get read.table()'s automatic names (V1, V2, ...).
# data source: http://archive.ics.uci.edu/ml/datasets/Abalone
df <- read.table(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",
  sep = ","
)

# V1 is the grouping column, so it has to be a factor.
df$V1 <- as.factor(df$V1)

# One axis per remaining column (2 to the last), one line per row,
# coloured by V1.
ggparcoord(
  data = df,
  columns = 2:ncol(df),
  groupColumn = 1,
  order = "anyClass",
  showPoints = TRUE,
  alphaLines = 0.3,
  title = "Parallel Coordinate Plot for the Abalone Data"
)

We can clearly see that the observations with V1 = I differ from the rest in all variables.

Highlight the main purpose of ggparallel: Breast Cancer Data Set

# Breast cancer records from the UCI repository.
# data source: http://archive.ics.uci.edu/ml/datasets/Breast+Cancer
#
# The data file has no header row. Reading it with header = TRUE would consume
# the first patient record as column names (and silently drop that record), so
# the file is read with header = FALSE and the columns are named explicitly,
# following the attribute list in the dataset description.
bc <- read.table(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data",
  sep = ",",
  header = FALSE,
  col.names = c(
    "class", "age", "menopause", "tumor_size", "inv_nodes",
    "node_caps", "deg_malig", "breast", "breast_quad", "irradiat"
  )
)

# Axes: class, age, menopause, and class again so that the flow closes on the
# variable it started from. order = 0 is passed through unchanged from the
# original call.
ggparallel(names(bc)[c(1:3, 1)], data = bc, order = 0) +
  scale_fill_brewer(palette = "Paired", guide = "none") +
  scale_colour_brewer(palette = "Paired", guide = "none")

## Warning: attributes are not identical across measure variables; they will
## be dropped

We can see that recurrent events in breast cancer are less common at younger ages; however, the remaining variables are widely dispersed.

Medium

First approach

library(tidyr)  
# Build a palette of `n` colours whose hues are evenly spaced around the HCL
# colour wheel (luminance 65, chroma 100); intended to match the default
# discrete colours that ggplot2 assigns to a factor with `n` levels.
#
# Arguments:
#   n  number of colours wanted (a non-negative whole number).
#
# Returns a character vector of `n` hex colour codes; character(0) when
# `n` is 0.
gg_color_hue <- function(n) {
  # n + 1 hues are generated because 15 and 375 degrees are the same hue;
  # the duplicated end point is dropped by the subsetting below.
  hues <- seq(15, 375, length.out = n + 1)
  # seq_len() gives an empty index for n = 0, whereas 1:n would be c(1, 0)
  # and return one colour.
  hcl(h = hues, l = 65, c = 100)[seq_len(n)]
}

# Standardise every column except the first (the grouping factor V1), then
# stack the standardised columns into long format: one row per
# (observation, variable) pair.
df2 <- df
df2[, -1] <- scale(df2[, -1])
df2 <- gather(df2, variable, value, -V1)

# Put the axes in the same order as the previous plot by permuting the
# (alphabetically sorted) variable names.
df2$variable <- factor(
  df2$variable,
  levels = levels(factor(df2$variable))[c(2, 4, 6, 1, 7, 5, 3, 8)]
)

# Observation identifier: the sequence 1..nrow(df) repeated for each variable,
# so that all values belonging to one original row share an id.
df2$id <- rep(seq_len(nrow(df)), length.out = nrow(df2))

# One colour per level of V1; the points are coloured by hand from this
# palette.
pal <- gg_color_hue(nlevels(df2$V1))
ggplot(df2) +
  geom_line(aes(x = variable, y = value, group = id, colour = V1)) +
  geom_point(aes(x = variable, y = value), colour = pal[as.integer(df2$V1)])

The function

# Draw a parallel coordinate plot of the standardised measurement columns of
# `data`, with lines and points coloured by a grouping column.
#
# Arguments:
#   data       data frame (or tibble) with one grouping column and at least
#              one numeric measurement column.
#   groupVble  name of the grouping column, as a single string.
#
# Returns a ggplot object: one line per row of `data`, one point per
# (row, variable) pair, and a colour legend titled with `groupVble`.
#
# Stops with an error if `groupVble` is not a single column name of `data` or
# if there is no measurement column left to plot.
f.parallelCoor <- function(data, groupVble = "V1") {
  # Work on a plain data frame so tibbles and data frames behave the same.
  data <- as.data.frame(data)
  stopifnot(
    is.character(groupVble),
    length(groupVble) == 1L,
    groupVble %in% names(data)
  )
  measures <- setdiff(names(data), groupVble)
  stopifnot(length(measures) > 0L)

  n_obs <- nrow(data)
  n_vars <- length(measures)

  # The group is looked up by name, so it stays correct whatever the position
  # of the column in `data`.
  group_value <- data[[groupVble]]
  if (!is.factor(group_value)) {
    group_value <- factor(group_value)
  }

  # Centre and scale each measurement column on its own.
  scaled <- lapply(data[measures], function(x) as.vector(scale(x)))

  # Long format, built explicitly: the blocks of `n_obs` rows (one block per
  # variable) all carry the same ids and group values, with no reliance on
  # silent recycling.
  long <- data.frame(
    group_value = rep(group_value, times = n_vars),
    variable = rep(measures, each = n_obs),
    value = unlist(scaled, use.names = FALSE),
    id = rep(seq_len(n_obs), times = n_vars),
    stringsAsFactors = FALSE
  )

  # Colour is mapped through aes() for both layers so that lines and points
  # always share one colour scale.
  ggplot2::ggplot(
    long,
    ggplot2::aes(x = variable, y = value, colour = group_value)
  ) +
    ggplot2::geom_line(ggplot2::aes(group = id)) +
    ggplot2::geom_point() +
    ggplot2::labs(colour = groupVble)
}

# Abalone data, grouped by the V1 column.
f.parallelCoor(df, groupVble = "V1")

# A random sample of 100 rows of the diamonds data, keeping columns 1, 2 and
# 5 to 10, grouped by the cut column.
diamonds.samp <- diamonds[sample(nrow(diamonds), 100), ]
f.parallelCoor(diamonds.samp[c(1, 2, 5:10)], groupVble = "cut")

Hard

Expand the functionality to include jittering (done)

library (plyr)

# Draw a parallel coordinate plot of the standardised measurement columns of
# `data`, with lines and points coloured by a grouping column and, optionally,
# random jitter added to the plotted values.
#
# Arguments:
#   data       data frame (or tibble) with one grouping column and at least
#              one numeric measurement column.
#   groupVble  name of the grouping column, as a single string.
#   jit        if TRUE, jitter the standardised values of each variable.
#   jitFactor  `factor` argument passed to jitter(); the default is large so
#              that the jitter is visible.
#
# Returns a ggplot object: one line per row of `data`, one point per
# (row, variable) pair, and a colour legend titled with `groupVble`.
#
# Stops with an error if `groupVble` is not a single column name of `data` or
# if there is no measurement column left to plot.
f.parallelCoor <- function(data, groupVble = "V1", jit = FALSE,
                           jitFactor = 10) {
  # Work on a plain data frame so tibbles and data frames behave the same.
  data <- as.data.frame(data)
  stopifnot(
    is.character(groupVble),
    length(groupVble) == 1L,
    groupVble %in% names(data)
  )
  measures <- setdiff(names(data), groupVble)
  stopifnot(length(measures) > 0L)

  n_obs <- nrow(data)
  n_vars <- length(measures)

  # The group is looked up by name, so it stays correct whatever the position
  # of the column in `data`.
  group_value <- data[[groupVble]]
  if (!is.factor(group_value)) {
    group_value <- factor(group_value)
  }

  # Centre and scale each measurement column on its own.
  scaled <- lapply(data[measures], function(x) as.vector(scale(x)))

  # Jitter is applied per variable, before stacking, so every jittered value
  # stays attached to its own row and variable.
  if (jit) {
    scaled <- lapply(scaled, jitter, factor = jitFactor)
  }

  # Long format, built explicitly: the blocks of `n_obs` rows (one block per
  # variable) all carry the same ids and group values, with no reliance on
  # silent recycling.
  long <- data.frame(
    group_value = rep(group_value, times = n_vars),
    variable = rep(measures, each = n_obs),
    value = unlist(scaled, use.names = FALSE),
    id = rep(seq_len(n_obs), times = n_vars),
    stringsAsFactors = FALSE
  )

  # Colour is mapped through aes() for both layers so that lines and points
  # always share one colour scale.
  ggplot2::ggplot(
    long,
    ggplot2::aes(x = variable, y = value, colour = group_value)
  ) +
    ggplot2::geom_line(ggplot2::aes(group = id)) +
    ggplot2::geom_point() +
    ggplot2::labs(colour = groupVble)
}

# Abalone data, grouped by the V1 column, with jitter switched on.
# TRUE is spelled out because the shorthand T is an ordinary variable that
# can be reassigned.
f.parallelCoor(df, groupVble = "V1", jit = TRUE)

# A random sample of 100 rows of the diamonds data, keeping columns 1, 2 and
# 5 to 10, grouped by the cut column, with jitter switched on.
diamonds.samp <- diamonds[sample(nrow(diamonds), 100), ]
f.parallelCoor(diamonds.samp[c(1, 2, 5:10)], groupVble = "cut", jit = TRUE)

Link to the package

As part of the hard solution, I created the pcplots package. See it here: https://github.com/auroragonzalez/pcplots

Test

Aurora González Vidal

5 de abril de 2019