StatFunGen
diff --git a/Simulation_Studies/10_OPERA_simulation.ipynb b/Simulation_Studies/10_Run_OPERA.ipynb
Simulation_Studies/10_OPERA_simulation.ipynb renamed to Simulation_Studies/10_Run_OPERA.ipynb
Lines changed: 260 additions & 166 deletions
diff --git a/Simulation_Studies/11_OPERA_original_design.ipynb b/Simulation_Studies/11_Run_OPERA_Original_Proportion.ipynb
Simulation_Studies/11_OPERA_original_design.ipynb renamed to Simulation_Studies/11_Run_OPERA_Original_Proportion.ipynb
Lines changed: 100 additions & 48 deletions
@@ -7,7 +7,7 @@
     "kernel": "R"
    },
    "source": [
-    "# OPERA: simulation using original proportion configuration"
+    "# Running OPERA: simulation using original proportion configuration"
    ]
   },
   {
@@ -42,44 +42,60 @@
    },
    "outputs": [],
    "source": [
-    "# 3 trait index\n",
+    "# Generate partitioned indices for 3-trait simulation\n",
+    "# (Mixed proportions across different causal configurations)\n",
+    "\n",
+    "# Load required libraries\n",
     "library(tidyverse)\n",
-    "total_indices <- 500\n",
-    "proportions <- c(0.78, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.01)\n",
     "\n",
-    "# Calculate the exact number of indices for each group\n",
-    "group_sizes <- floor(total_indices * proportions)\n",
+    "# ----------------------\n",
+    "# Step 1: Partition indices based on predefined proportions\n",
+    "# ----------------------\n",
     "\n",
-    "# Adjust for any rounding errors\n",
-    "group_sizes[1] <- group_sizes[1] + (total_indices - sum(group_sizes))\n",
-    "\n",
-    "# Create a vector of group assignments\n",
-    "groups <- rep(1:8, times = group_sizes)\n",
+    "total_indices <- 500  # Total number of samples\n",
+    "proportions <- c(0.78, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.01)  # Group proportions\n",
     "\n",
+    "# Calculate number of samples in each group\n",
+    "group_sizes <- floor(total_indices * proportions)\n",
+    "group_sizes[1] <- group_sizes[1] + (total_indices - sum(group_sizes))  # Correct rounding error\n",
     "\n",
-    "groups <- sample(groups)\n",
+    "# Assign each sample to a group\n",
+    "groups <- rep(1:8, times = group_sizes)\n",
+    "groups <- sample(groups)  # Shuffle assignments\n",
     "\n",
-    "# Create a list to store the indices for each group\n",
+    "# Create a list of indices for each group\n",
     "partitioned_indices <- lapply(1:8, function(i) which(groups == i) - 1)\n",
     "\n",
-    "# Name the list elements for clarity\n",
+    "# Name the groups\n",
     "names(partitioned_indices) <- paste(\"Group\", 1:8)\n",
     "\n",
-    "# Print the number of indices in each group\n",
+    "# Print number of indices per group\n",
     "sapply(partitioned_indices, length)\n",
-    "                              \n",
-    "                              \n",
-    "config = list(c(1,0,0,0),\n",
-    "             c(1,1,0,0),\n",
-    "             c(1,0,1,0),\n",
-    "             c(1, 0, 0, 1),\n",
-    "             c(1,1,1,0),\n",
-    "             c(1,1,0,1),\n",
-    "             c(1,0,1,1),\n",
-    "             c(1,1,1,1))\n",
-    "\n",
-    "\n",
-    "saveRDS(list(partitioned_indices = partitioned_indices, config = config), \"/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/index/partitioned_index_3trait.rds\")\n"
+    "\n",
+    "# ----------------------\n",
+    "# Step 2: Define causal configurations for each group\n",
+    "# ----------------------\n",
+    "\n",
+    "\n",
+    "config = list(\n",
+    "    c(1, 0, 0, 0), \n",
+    "    c(1, 1, 0, 0),  \n",
+    "    c(1, 0, 1, 0),  \n",
+    "    c(1, 0, 0, 1),  \n",
+    "    c(1, 1, 1, 0),  \n",
+    "    c(1, 1, 0, 1), \n",
+    "    c(1, 0, 1, 1), \n",
+    "    c(1, 1, 1, 1)   \n",
+    ")\n",
+    "\n",
+    "# ----------------------\n",
+    "# Step 3: Save partitioned index and configuration\n",
+    "# ----------------------\n",
+    "\n",
+    "saveRDS(\n",
+    "    list(partitioned_indices = partitioned_indices, config = config),\n",
+    "    \"/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/index/partitioned_index_3trait.rds\"\n",
+    ")\n"
    ]
   },
   {
@@ -198,14 +214,19 @@
     "output: f'{cwd:a}/{step_name}/sample_{_index}_opera_3trait.rds'\n",
     "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n",
     "R:  expand = '${ }', stdout = f\"{_output:n}.stdout\", stderr = f\"{_output:n}.stderr\", container = container \n",
+    "    # --- Load necessary libraries ---\n",
     "    library(\"MASS\")\n",
     "    library(\"plink2R\")\n",
     "    library(\"dplyr\")\n",
     "    library(\"readr\")\n",
     "    library(\"tidyverse\")\n",
-    "    # source some functions to read matrix and inpute the missing data\n",
-    "    source(\"~/cloud_colocalization/simulation_code/simulate_linreg.R\")\n",
-    "    source(\"~/cloud_colocalization/simulation_code/misc.R\")\n",
+    "  \n",
+    "    # Install the simulation packages (one-time setup; uncomment the lines below if they are not yet installed)\n",
+    "    # devtools::install_github(\"StatFunGen/simxQTL\", build_vignettes = FALSE)\n",
+    "    # BiocManager::install(\"StatFunGen/pecotmr\")\n",
+    "    library(\"pecotmr\")\n",
+    "    library(\"simxQTL\")\n",
+    "  \n",
     "    calculate_sumstat = function(X, Y){\n",
     "        Beta = c()\n",
     "        se = c()\n",
@@ -242,21 +263,32 @@
     "\n",
     "    indep = ${\"TRUE\" if independent else \"FALSE\"}\n",
     "    if (indep) {\n",
-    "        LD_vars = 1  # Initialize LD_vars\n",
+    "        LD_vars = 1  # Initialize LD check\n",
     "\n",
     "        if (ncausal == 1) {\n",
-    "            # If only one causal variant, just sample it\n",
+    "            # Only one causal variant needed\n",
     "            vars = sample(1:ncol(Xmat), size = ncausal)\n",
     "        } else {\n",
-    "            # Repeat sampling until selected variables are quasi independent\n",
+    "            # Ensure selected variants are approximately independent (LD < 0.3)\n",
     "            while (length(LD_vars != 0)) {\n",
-    "                vars = sample(1:ncol(Xmat), size = ncausal)  \n",
-    "                cor_mat = cor(Xmat[, vars]) \n",
+    "                vars = sample(1:ncol(Xmat), size = ncausal)\n",
+    "                cor_mat = cor(Xmat[, vars])\n",
     "                LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)\n",
     "            }\n",
     "        }\n",
     "    } else {\n",
-    "        vars = sample(1:ncol(Xmat), size = ncausal)\n",
+    "        LD_vars = 1  # Initialize LD check\n",
+    "\n",
+    "        if (ncausal == 1) {\n",
+    "            vars = sample(1:ncol(Xmat), size = ncausal)\n",
+    "        } else {\n",
+    "            # Avoid perfectly correlated variants (|cor| = 1)\n",
+    "            while (length(LD_vars != 0)) {\n",
+    "                vars = sample(1:ncol(Xmat), size = ncausal)\n",
+    "                cor_mat = cor(Xmat[, vars])\n",
+    "                LD_vars = which(colSums(abs(cor_mat) == 1) > 1)\n",
+    "            }\n",
+    "        }\n",
     "    }\n",
     "  \n",
     "    result <- sapply(partitioned_res$partitioned_indices, function(x) any(x == tad_number))\n",
@@ -326,14 +358,21 @@
     "output: f'{cwd:a}/{step_name}/sample_{_index}_opera_5trait.rds'\n",
     "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n",
     "R:  expand = '${ }', stdout = f\"{_output:n}.stdout\", stderr = f\"{_output:n}.stderr\", container = container \n",
+    "    # --- Load necessary libraries ---\n",
     "    library(\"MASS\")\n",
     "    library(\"plink2R\")\n",
     "    library(\"dplyr\")\n",
     "    library(\"readr\")\n",
     "    library(\"tidyverse\")\n",
-    "    # source some functions to read matrix and inpute the missing data\n",
-    "    source(\"~/cloud_colocalization/simulation_code/simulate_linreg.R\")\n",
-    "    source(\"~/cloud_colocalization/simulation_code/misc.R\")\n",
+    "  \n",
+    "    # Install the simulation packages (one-time setup; uncomment the lines below if they are not yet installed)\n",
+    "    # devtools::install_github(\"StatFunGen/simxQTL\", build_vignettes = FALSE)\n",
+    "    # BiocManager::install(\"StatFunGen/pecotmr\")\n",
+    "    library(\"pecotmr\")\n",
+    "    library(\"simxQTL\")\n",
+    "    \n",
+    "    # --- Load helper functions ---\n",
+    "    source(\"~/simxQTL/simulate_linreg.R\")\n",
     "    calculate_sumstat = function(X, Y){\n",
     "        Beta = c()\n",
     "        se = c()\n",
@@ -370,21 +409,32 @@
     "\n",
     "    indep = ${\"TRUE\" if independent else \"FALSE\"}\n",
     "    if (indep) {\n",
-    "        LD_vars = 1  # Initialize LD_vars\n",
+    "        LD_vars = 1  # Initialize LD check\n",
     "\n",
     "        if (ncausal == 1) {\n",
-    "            # If only one causal variant, just sample it\n",
+    "            # Only one causal variant needed\n",
     "            vars = sample(1:ncol(Xmat), size = ncausal)\n",
     "        } else {\n",
-    "            # Repeat sampling until selected variables are quasi independent\n",
+    "            # Ensure selected variants are approximately independent (LD < 0.3)\n",
     "            while (length(LD_vars != 0)) {\n",
-    "                vars = sample(1:ncol(Xmat), size = ncausal)  \n",
-    "                cor_mat = cor(Xmat[, vars]) \n",
+    "                vars = sample(1:ncol(Xmat), size = ncausal)\n",
+    "                cor_mat = cor(Xmat[, vars])\n",
     "                LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)\n",
     "            }\n",
     "        }\n",
     "    } else {\n",
-    "        vars = sample(1:ncol(Xmat), size = ncausal)\n",
+    "        LD_vars = 1  # Initialize LD check\n",
+    "\n",
+    "        if (ncausal == 1) {\n",
+    "            vars = sample(1:ncol(Xmat), size = ncausal)\n",
+    "        } else {\n",
+    "            # Avoid perfectly correlated variants (|cor| = 1)\n",
+    "            while (length(LD_vars != 0)) {\n",
+    "                vars = sample(1:ncol(Xmat), size = ncausal)\n",
+    "                cor_mat = cor(Xmat[, vars])\n",
+    "                LD_vars = which(colSums(abs(cor_mat) == 1) > 1)\n",
+    "            }\n",
+    "        }\n",
     "    }\n",
     "  \n",
     "    result <- sapply(partitioned_res$partitioned_indices, function(x) any(x == tad_number))\n",
@@ -998,9 +1048,11 @@
     "output: f'{cwd:a}/{_input[0]:bn}_ntrait_{trait}_{step_name}.rds'\n",
     "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n",
     "R:  expand = '${ }', stdout = f\"{_output:n}.stdout\", stderr = f\"{_output:n}.stderr\", container = container \n",
-    "    for(file in list.files(\"/home/xc2270/COLOCBoost/code_COLOCBoost/colocboost_updating/\", full.names = T)) {source(file)}\n",
     "    library(tidyverse)\n",
-    "    data = readRDS(${_input:ar})\n",
+    "    # --- Load colocboost function files ---\n",
+    "    # devtools::install_github(\"StatFunGen/colocboost\")\n",
+    "    # for(file in list.files(\"/home/xc2270/COLOCBoost/code_COLOCBoost/release\", full.names = T)) {source(file)}\n",
+    "    library(\"colocboost\")\n",
     "    if(length(data$trait) < 5){\n",
     "        trait1 = data$trait[[1]] %>% mutate(beta = b, sebeta = se, n = N, variant = SNP) %>% select(beta, sebeta, n, variant) %>% as.data.frame()\n",
     "        trait2 = data$trait[[2]] %>% mutate(beta = Beta, sebeta = se, n = 1160, variant = SNP)%>% select(beta, sebeta, n, variant) %>% as.data.frame()\n",