library(tidyverse)
setwd("")
################### DATA PREPROCESSING 2020 ################
h2020 <- read_csv("hanson2020.csv")
colnames(h2020) <- make.names(colnames(h2020),allow_=F)
h2020 <- h2020[,-c(1,4,6,7,8,9,10,11,12,13,14,67)]
# Function to replace NA with minimum value in each row
replace_na_with_min <- function(row) {
min_val <- min(row, na.rm = TRUE)
row[is.na(row)] <- min_val
return(row)
}
# impute missing values with minimum from each metabolite
h2020[h2020==0] <- NA
h2020 <- bind_cols(h2020[,c(1,2,3)],as_tibble(t(apply(h2020[,-c(1,2,3)], 1, replace_na_with_min))))
# normalise by 10,000 units of intensity seen in that sample
h2020 <- h2020 %>% mutate_at(vars(C1

26), funs((./sum(.))*10000))
################### DATA PREPROCESSING 2022 ################
h2022 <- read_csv("hanson2022_raw2.csv",col_names=F)
colnames(h2022) <- make.names(c(as.character(h2022[8,1:15]),"uniqID",1:404))
lookup22 <- rbind(colnames(h2022),h2022[1:7,])
lookup22 <- lookup22[,-c(1:15)] %>% t %>% as_tibble
colnames(lookup22) <- make.names(lookup22[1,])
lookup22 <- lookup22[-1,]
h2022 <- h2022[-c(1:8),]
h2022 <- h2022[,-c(1,4,6,7,8,9,10,11,12,13,14,15,16,421,422)]
# replace NAs with minimum values, then remove metabolites with no data
h2022 <- h2022 %>% mutate_at(vars(X1:X404), funs(as.numeric(gsub(",", "", ., fixed = TRUE))))
h2022$COMP.ID <- as.numeric(h2022$COMP.ID)
h2022 <- bind_cols(h2022[,c(1,2,3)],as_tibble(t(apply(h2022[,-c(1,2,3)], 1, replace_na_with_min))))
h2022 <- h2022 %>%
mutate(across(everything(), ~replace(., is.infinite(.), 0)))
h2022 <- h2022[-which(rowSums(h2022[,-c(1,2,3)]) == 0),]
# normalise by 10,000 units of intensity seen in that sample
h2022 <- h2022 %>% mutate_at(vars(X1:X404), funs((./sum(.))*10000))
# filter to only D1-PRE (TO LOOK AT OTHER POINTS JUST CHANGE D1-PRE BELOW TO WHATEVER)
uniqID_to_remove <- filter(lookup22,Timepoint!="D1-PRE")$uniqID
h2022 <- select(h2022, -uniqID_to_remove)
################## TRIAL RUN JUST PICKING OUT AND COMPARING PROLINE (AND SUITABILITY OF LOG2 TRANSFORM) ################
# find proline for a test run
h2020[grep("proline",h2020$BIOCHEMICAL),]
h2022[grep("proline",h2022$BIOCHEMICAL),]
# proline's comp id is 1898
proline2022 <- h2022[which(h2022[,3]==1898),]
proline2020 <- h2020[which(h2020[,3]==1898),]
# comparing normality assumptions with untransformed and log2 transformed data
proline2020_vec <- as.numeric(proline2020[1,-c(1:3)])
logpro20_vec <- log2(proline2020_vec)
hist(proline2020_vec)
qqnorm(proline2020_vec)
qqline(proline2020_vec)
# log fits better (for proline in 2020 data)
hist(logpro20_vec)
qqnorm(logpro20_vec)
qqline(logpro20_vec)
proline2022_vec <- as.numeric(proline2022[1,-c(1:3)])
logpro22_vec <- log2(proline2022_vec)
hist(proline2022_vec)
qqnorm(proline2022_vec)
qqline(proline2022_vec)
# log fits better (for proline in 2022 data)
hist(logpro22_vec)
qqnorm(logpro22_vec)
qqline(logpro22_vec)
# preprocessing dataframe so group names are consistent
pro20_long <- proline2020 %>% gather("group", "y", -c(BIOCHEMICAL,SUPER.PATHWAY,COMP.ID))
pro20_long <- pro20_long %>%
mutate(group = case_when(
startsWith(group, "C") ~ "Control",
startsWith(group, "P") ~ "CFS",
TRUE ~ group # if neither condition is met, keep the original value
))
pro22_long <- proline2022 %>% gather("uniqID", "y", -c(BIOCHEMICAL,'SUPER.PATHWAY','COMP.ID'))
pro22_long <- left_join(pro22_long,lookup22,by="uniqID") %>% select(BIOCHEMICAL,SUPER.PATHWAY,COMP.ID,group = Phenotype,y)
# combining data
pro22_long$paper <- rep("2022",nrow(pro22_long))
pro20_long$paper <- rep("2020",nrow(pro20_long))
pro_long <- bind_rows(pro20_long,pro22_long)
pro_long$logy <- log2(pro_long$y)
# making linear model with interaction term for proline
pro_m1 <- lm(logy ~ group + paper + group

aper,data=pro_long)
# can not say that the fold changed observed in on paper is different to the fold change observed in the other
summary(pro_m1)
ggplot(pro_long, aes(x=group,y=logy,color=paper)) +
geom_jitter(alpha=0.5) +
geom_violin()
############## GENERALISED FUNCTION FOR COMPARING METABOLITES AND INTERACTIONS WITH PAPER ##############
# function just generalised version of what was done for proline
get_met_model <- function(comp_id, log2_tranform=TRUE) {
met2022 <- h2022[which(h2022[,3]==comp_id),]
met2020 <- h2020[which(h2020[,3]==comp_id),]
met20_long <- met2020 %>% gather("group", "y", -c(BIOCHEMICAL,SUPER.PATHWAY,COMP.ID))
met20_long <- met20_long %>%
mutate(group = case_when(
startsWith(group, "C") ~ "Control",
startsWith(group, "P") ~ "CFS",
TRUE ~ group # if neither condition is met, keep the original value
))
met22_long <- met2022 %>% gather("uniqID", "y", -c(BIOCHEMICAL,SUPER.PATHWAY,COMP.ID))
met22_long <- left_join(met22_long,lookup22,by="uniqID") %>%
select(BIOCHEMICAL,SUPER.PATHWAY,COMP.ID,group = Phenotype,y)
met22_long$paper <- rep("2022",nrow(met22_long))
met20_long$paper <- rep("2020",nrow(met20_long))
#print(met20_long)
#print(met22_long)
met_long <- bind_rows(met20_long,met22_long)
met_long$logy <- log2(met_long$y)
met_m1 <- lm(logy ~ group + paper + group

aper,data=met_long)
#summary(met_m1)
# pulling out relevant coefficients and p values from the model and returning them
FC_2020 <- coef(summary(met_m1))[2,1]
FC_2022 <- coef(summary(met_m1))[2,1]+coef(summary(met_m1))[4,1]
FCdiff_p <- coef(summary(met_m1))[4,4]
FC_p <- coef(summary(met_m1))[2,4]
terms <- c(comp_id, FC_2020, FC_2022, FCdiff_p, FC_p)
terms
}
# sanity check: test output matches proline summary from model above
m1 <- get_met_model(comp_id=1898)
summary(pro_m1)
m1
# it does!
#################### CALCULATE P VALS, INTERACTION P VALS ETC FOR EVERY METABOLITE ###################
# go through every common metabolite between the two papers and calculate parameters
common_compids <- intersect(h2022$COMP.ID,h2020$COMP.ID)
metabolite_interactions <- matrix(NA, length(common_compids),5)
for (i in 1:length(common_compids)) {
terms <- get_met_model(comp_id=common_compids
)
metabolite_interactions[i,1] <- terms[1]
metabolite_interactions[i,2] <- terms[2]
metabolite_interactions[i,3] <- terms[3]
metabolite_interactions[i,4] <- terms[4]
metabolite_interactions[i,5] <- terms[5]
}
# make tibble, change names, put log2(FC) estimates to 2^ to get FC estimates,
metabolite_interactions <- metabolite_interactions %>% as_tibble()
names(metabolite_interactions) <- c("COMP.ID","FC_2020","FC_2022","FCdiff_p","FC_p")
metabolite_interactions <- metabolite_interactions %>% mutate_at(vars(FC_2020:FC_2022), funs(2^.))
# df of all metabolites in both papers and their names etc
h_all <- bind_rows(h2022[,c(1:3)],h2020[,c(1:3)]) %>% distinct()
# add names to metabolite_interactions df
h_met_interactions <- left_join(metabolite_interactions, h_all, by="COMP.ID")
# add Agree column for if interaction p val < 0.05
h_met_interactions <- h_met_interactions %>%
mutate(Agree = ifelse(FCdiff_p < 0.05, 'N', 'Y')) %>%
mutate(significant = ifelse(FC_p < 0.05, 'Y', 'N'))
# multiple test correct p vals
h_met_interactions$FC_q <- p.adjust(h_met_interactions$FC_p, method="BH")
############### PLOTTING ##################
plot1 <- ggplot(h_met_interactions, aes(x=FC_2020, y=FC_2022,colour=Agree,shape=significant)) +
geom_point(size=3) +
geom_hline(yintercept=1) +
geom_vline(xintercept=1) +
theme_minimal() +
labs(x="Germain & Hanson et al 2020", y="Germain & Hanson et al 2022") +
scale_colour_manual(values = c("N" = "#7BAFD4", "Y" = "#FB9A99")) +
ggtitle("All metabolites found in both papers")
#ggsave("allmetabs_bothpapers.png",plot1,bg="white")
plot2 <- ggplot(h_met_interactions[which(h_met_interactions$significant=="Y"),], aes(x=FC_2020, y=FC_2022,colour=Agree)) +
geom_point(size=3,shape=17) +
geom_hline(yintercept=1) +
geom_vline(xintercept=1) +
theme_minimal() +
scale_colour_manual(values = c("N" = "#7BAFD4", "Y" = "#FB9A99")) +
labs(x="Germain & Hanson et al 2020", y="Germain & Hanson et al 2022") +
ggtitle("Metabolites found significant in this analysis")
#ggsave("significantmetabs_bothpapers.png",plot2,bg="white")
# plot 1 is all metabolites
plot1
# plot 2 is only significant metabolites
plot2
# to peruse
h_met_interactions %>% filter(significant == "Y" & Agree=="Y") %>% fix
h_met_interactions %>% arrange(FC_q) %>% fix