Data comes from here (see the Github).
Code
library(tidyverse)
library(plotly)
string_to_numeric_list <- function(s) {
s <- gsub("[\\[\\]]", "", s) # remove square brackets
s <- strsplit(s, ",", fixed = TRUE) # split the string using commas
# Set scipen to a high value to avoid scientific notation
options(scipen = 999)
# Filter out non-numeric characters before converting to numeric
numeric_values <- unlist(s)
numeric_values <- numeric_values[grepl("^-?\\d+(\\.\\d+)?([Ee]?-?\\d+)?$", numeric_values)]
as.numeric(numeric_values) # convert the character vector to numeric and return
}
hgdp_tgp <- read.delim('/home/carter/code/gnomad_meta_hgdp_tgp_v1.txt', header=T, sep='\t')
# Filter out rows with NAs in population_inference.pca_scores
hgdp_tgp <- hgdp_tgp[!is.na(hgdp_tgp$population_inference.pca_scores), ]
# Now create the numeric_pca_scores list
numeric_pca_scores <- lapply(hgdp_tgp$population_inference.pca_scores, string_to_numeric_list)
pca_scores <- as.data.frame(do.call(rbind, numeric_pca_scores))
colnames(pca_scores)[1:3] <- c("PC1", "PC2", "PC3")
colors <- hgdp_tgp$hgdp_tgp_meta.Pop.colors
# Add the "hgdp_tgp_meta.Population" column to the combined_data dataframe
populations <- hgdp_tgp$hgdp_tgp_meta.Population
combined_data <- cbind(pca_scores, colors, populations)
# Convert population codes to names
population_codes <- c(
"CHB", "JPT", "CHS", "CDX", "KHV", "CHD",
"CEU", "TSI", "GBR", "FIN", "IBS",
"YRI", "LWK", "GWD", "MSL", "ESN",
"ASW", "ACB", "MXL", "PUR", "CLM", "PEL",
"GIH", "PJL", "BEB", "STU", "ITU"
)
population_names <- c(
"Northern Han Chinese",
"Japanese",
"Southern Han Chinese",
"Chinese Dai",
"Kinh",
"US Chinese",
"Utah resident of European ancestry",
"Toscani",
"British",
"Finnish",
"Iberian populations",
"Yoruba",
"Luhya",
"Gambian",
"Mende",
"Esan",
"African American",
"African Caribbean",
"US Hispanic (Mexican)",
"Puerto Rican",
"Colombian",
"Peruvian",
"Gujarati Indian (in US)",
"Punjabi",
"Bengali",
"Sri Lankan Tamil (in UK)",
"Indian Telugu (in UK)"
)
pop_code_to_name <- setNames(population_names, population_codes)
combined_data$populations <- ifelse(combined_data$populations %in% population_codes,
pop_code_to_name[combined_data$populations],
combined_data$populations)
plot3d <- plot_ly(
data = combined_data,
x = ~PC1,
y = ~PC2,
z = ~PC3,
type = "scatter3d",
mode = "markers",
marker = list(
size = 2.5,
color = ~colors,
opacity = 1
),
hovertemplate = paste("%{text}<extra></extra>"),
text = ~populations
)
Plot
I’ve converted the population codes in the data to their respective population names.