3D PCA Plot

2023/03/22

Data comes from here (see the Github).

Code

library(tidyverse)
library(plotly)

string_to_numeric_list <- function(s) {
  s <- gsub("[\\[\\]]", "", s) # remove square brackets
  s <- strsplit(s, ",", fixed = TRUE) # split the string using commas
  # Set scipen to a high value to avoid scientific notation
  options(scipen = 999)
  
  # Filter out non-numeric characters before converting to numeric
  numeric_values <- unlist(s)
  numeric_values <- numeric_values[grepl("^-?\\d+(\\.\\d+)?([Ee]?-?\\d+)?$", numeric_values)]
  
  as.numeric(numeric_values) # convert the character vector to numeric and return
}

hgdp_tgp <- read.delim('/home/carter/code/gnomad_meta_hgdp_tgp_v1.txt', header=T, sep='\t')

# Filter out rows with NAs in population_inference.pca_scores
hgdp_tgp <- hgdp_tgp[!is.na(hgdp_tgp$population_inference.pca_scores), ]

# Now create the numeric_pca_scores list
numeric_pca_scores <- lapply(hgdp_tgp$population_inference.pca_scores, string_to_numeric_list)

pca_scores <- as.data.frame(do.call(rbind, numeric_pca_scores))

colnames(pca_scores)[1:3] <- c("PC1", "PC2", "PC3")

colors <- hgdp_tgp$hgdp_tgp_meta.Pop.colors

# Add the "hgdp_tgp_meta.Population" column to the combined_data dataframe
populations <- hgdp_tgp$hgdp_tgp_meta.Population
combined_data <- cbind(pca_scores, colors, populations)

# Convert population codes to names

population_codes <- c(
"CHB", "JPT", "CHS", "CDX", "KHV", "CHD",
"CEU", "TSI", "GBR", "FIN", "IBS",
"YRI", "LWK", "GWD", "MSL", "ESN",
"ASW", "ACB", "MXL", "PUR", "CLM", "PEL",
"GIH", "PJL", "BEB", "STU", "ITU"
)

population_names <- c(
  "Northern Han Chinese",
  "Japanese",
  "Southern Han Chinese",
  "Chinese Dai",
  "Kinh",
  "US Chinese",
  "Utah resident of European ancestry",
  "Toscani",
  "British",
  "Finnish",
  "Iberian populations",
  "Yoruba",
  "Luhya",
  "Gambian",
  "Mende",
  "Esan",
  "African American",
  "African Caribbean",
  "US Hispanic (Mexican)",
  "Puerto Rican",
  "Colombian",
  "Peruvian",
  "Gujarati Indian (in US)",
  "Punjabi",
  "Bengali",
  "Sri Lankan Tamil (in UK)",
  "Indian Telugu (in UK)"
)

pop_code_to_name <- setNames(population_names, population_codes)
combined_data$populations <- ifelse(combined_data$populations %in% population_codes,
                                    pop_code_to_name[combined_data$populations],
                                    combined_data$populations)

plot3d <- plot_ly(
  data = combined_data,
  x = ~PC1,
  y = ~PC2,
  z = ~PC3,
  type = "scatter3d",
  mode = "markers",
  marker = list(
    size = 2.5,
    color = ~colors,
    opacity = 1
  ),
  hovertemplate = paste("%{text}<extra></extra>"),
  text = ~populations
)

Plot

I’ve converted the population codes in the data to their respective population names.