From 09418174ba3fffa753397b3b4f4137d67c1cefc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 17 Feb 2026 09:48:39 +0100 Subject: [PATCH 1/7] app: load revel and alphamissense in the collection missense_variation_functional_score, #TASK-8163 --- .../admin/executors/LoadCommandExecutor.java | 29 ++++++++++++++++--- .../lib/loader/MongoDBCellBaseLoader.java | 20 +++++++++---- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 5f03ddc96..6ed95ec54 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -22,6 +22,7 @@ import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.models.Release; import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.core.utils.DatabaseNameUtils; @@ -359,17 +360,25 @@ private void loadProteinFunctionalPrediction() throws NoSuchMethodException, Int } private void loadRevel() throws CellBaseException { + // Check if REVEL source has already been loaded + checkSourceAlreadyLoaded(REVEL_DATA); + HashMap collectionMap = new HashMap<>(); - collectionMap.put(PROTEIN_SUBSTITUTION_PREDICTION_DATA, REVEL_DATA + JSON_GZ_EXTENSION); + collectionMap.put(MISSENSE_VARIATION_SCORE_DATA, REVEL_DATA + JSON_GZ_EXTENSION); - loadData(input.resolve(PROTEIN_SUBSTITUTION_PREDICTION_DATA).resolve(REVEL_DATA), collectionMap); + Path revelPath = input.resolve(PROTEIN_SUBSTITUTION_PREDICTION_DATA).resolve(REVEL_DATA); 
+ loadData(revelPath, collectionMap); } private void loadAlphaMissense() throws CellBaseException { + // Check if AlphaMissense source has already been loaded + checkSourceAlreadyLoaded(ALPHAMISSENSE_DATA); + HashMap collectionMap = new HashMap<>(); - collectionMap.put(PROTEIN_SUBSTITUTION_PREDICTION_DATA, ALPHAMISSENSE_DATA + JSON_GZ_EXTENSION); + collectionMap.put(MISSENSE_VARIATION_SCORE_DATA, ALPHAMISSENSE_DATA + JSON_GZ_EXTENSION); - loadData(input.resolve(PROTEIN_SUBSTITUTION_PREDICTION_DATA).resolve(ALPHAMISSENSE_DATA), collectionMap); + Path alphaMissensePath = input.resolve(PROTEIN_SUBSTITUTION_PREDICTION_DATA).resolve(ALPHAMISSENSE_DATA); + loadData(alphaMissensePath, collectionMap); } private void loadClinical() throws FileNotFoundException { @@ -677,4 +686,16 @@ private Release getDataReleaseForLoading(DataReleaseManager dataReleaseManager) } return lastDataRelease; } + + private void checkSourceAlreadyLoaded(String sourceId) throws CellBaseException { + Release release = getDataReleaseForLoading(dataReleaseManager); + if (release.getSources() != null) { + for (DataSource source : release.getSources()) { + if (sourceId.equalsIgnoreCase(source.getId())) { + throw new CellBaseException("Loading data '" + sourceId + "' with release " + dataRelease + + " failed: source '" + sourceId + "' already loaded previously"); + } + } + } + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java index 2c0bb84da..062987922 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java @@ -51,6 +51,8 @@ import java.util.concurrent.BlockingQueue; import java.util.stream.Collectors; +import static org.opencb.cellbase.lib.EtlCommons.MISSENSE_VARIATION_SCORE_DATA; + /** * Created by parce on 18/02/15. 
*/ @@ -137,13 +139,19 @@ private String getCollectionName() throws LoaderException { throw new LoaderException("Invalid data release " + dataRelease + " for database " + database + ". Available releases" + " are: " + StringUtils.join(releases, ",")); } - for (Release dr : result.getResults()) { - if (dr.getRelease() == dataRelease) { - if (dr.getCollections().containsKey(data) && dr.getCollections().get(data).equals(collectionName)) { - throw new LoaderException("Loading new data " + data + " with release " + dataRelease - + " (already populated previously)"); + + // Sanity check don't populate collections already populated + // Missense variation score data (i.e., revel and alphaMissense) is checked later, since revel and alphamissense are loaded + // in the same collection but independently + if (!data.equalsIgnoreCase(MISSENSE_VARIATION_SCORE_DATA)) { + for (Release dr : result.getResults()) { + if (dr.getRelease() == dataRelease) { + if (dr.getCollections().containsKey(data) && dr.getCollections().get(data).equals(collectionName)) { + throw new LoaderException("Loading new data '" + data + "' with release " + dataRelease + + " (already populated previously)"); + } + break; } - break; } } From 91df0296fb576d6dcafa68a683612806127d7c70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 17 Feb 2026 13:48:47 +0100 Subject: [PATCH 2/7] lib: fix retrieve for sift, polyphen, revel and alphamissense scores, #TASK-8163 --- .../lib/impl/core/ProteinMongoDBAdaptor.java | 296 ++++++++++++------ .../cellbase/lib/managers/ProteinManager.java | 6 +- .../src/main/resources/mongodb-indexes.json | 1 + .../test/resources/index/mongodb-indexes.json | 1 + 4 files changed, 210 insertions(+), 94 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java index 2d3efefa9..23061ad50 100644 --- 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java @@ -18,13 +18,16 @@ import com.mongodb.BasicDBList; import com.mongodb.client.model.Filters; +import com.mongodb.client.model.Projections; import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.collections4.MapUtils; import org.apache.commons.lang3.StringUtils; import org.bson.Document; import org.bson.conversions.Bson; import org.opencb.biodata.formats.protein.uniprot.v202003jaxb.Entry; import org.opencb.biodata.models.core.ProteinSubstitutionPrediction; import org.opencb.biodata.models.core.ProteinSubstitutionPredictionScore; +import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.ProteinFeature; import org.opencb.biodata.models.variant.avro.ProteinVariantAnnotation; import org.opencb.biodata.models.variant.avro.Score; @@ -36,6 +39,7 @@ import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.lib.iterator.CellBaseIterator; import org.opencb.cellbase.lib.iterator.CellBaseMongoDBIterator; +import org.opencb.cellbase.lib.variant.VariantAnnotationUtils; import org.opencb.commons.datastore.core.DataResult; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; @@ -44,6 +48,9 @@ import java.util.*; +import static org.opencb.cellbase.lib.EtlCommons.ALPHAMISSENSE_DATA; +import static org.opencb.cellbase.lib.EtlCommons.REVEL_DATA; + /** * Created by imedina on 01/12/15. 
*/ @@ -51,7 +58,7 @@ public class ProteinMongoDBAdaptor extends CellBaseDBAdaptor implements CellBase private Map proteinSubstitutionMongoDBCollectionByRelease; - private static final int NUM_PROTEIN_SUBSTITUTION_SCORE_METHODS = 2; + private static final int NUM_PROTEIN_SUBSTITUTION_SCORE_METHODS = 4; private static Map aaShortNameMap; @@ -94,24 +101,157 @@ private void init() { proteinSubstitutionMongoDBCollectionByRelease = buildCollectionByReleaseMap("protein_substitution_prediction"); } - public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, Integer aaPosition, String aa) throws CellBaseException { - long dbTimeStart = System.currentTimeMillis(); - Map scoreSet = new HashMap<>(); - - // transcriptId, aaPosition, aaAlternate are needed for this collection - if (query.getTranscriptsId() != null && query.getTranscriptsId().get(0) != null && aaPosition != null - && StringUtils.isNotEmpty(aa)) { + public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, String chromosome, Integer position, Integer aaPosition, + String aa) throws CellBaseException { + CellBaseDataResult result = null; + // Ensembl transcript id is needed for this collection + if (query.getTranscriptsId() != null && query.getTranscriptsId().get(0) != null) { + String transcriptId = query.getTranscriptsId().get(0).split("\\.")[0]; + Bson transcript = Filters.eq("transcriptId", transcriptId); MongoDBCollection mongoDBCollection = getCollectionByRelease(proteinSubstitutionMongoDBCollectionByRelease, query.getDataRelease()); - List andBsonList = new ArrayList<>(); + String aaShortName = null; + // If position and aa change are provided we create a 'projection' to return only the required data from the database + if (aaPosition != null) { + String projectionString = "aaPositions." + aaPosition; + + // If aa change is provided we only return that information + if (StringUtils.isNotEmpty(aa)) { + aaShortName = aaShortNameMap.get(aa.toUpperCase()); + projectionString += "." 
+ aaShortName; + } + + // Projection is used to minimize the returned data + Bson positionProjection = Projections.include(projectionString); + result = new CellBaseDataResult<>(mongoDBCollection.find(transcript, positionProjection, query.toQueryOptions())); + } else { + // Return the whole transcript data + result = new CellBaseDataResult<>(mongoDBCollection.find(transcript, query.toQueryOptions())); + } + + if (result != null && !result.getResults().isEmpty()) { + Document document = (Document) result.getResults().get(0); + Document aaPositionsDocument = (Document) document.get("aaPositions"); + + // Position or aa change were not provided, returning whole transcript data + if (aaPosition == null || aaPosition == -1 || aaShortName == null) { + // Return only the inner Document, not the whole document projected + result.setResults(Collections.singletonList(aaPositionsDocument)); + // Position and aa were provided, return only corresponding Score objects + } else { + List scoreList = new ArrayList<>(NUM_PROTEIN_SUBSTITUTION_SCORE_METHODS); + if (result.getNumResults() == 1 && aaPositionsDocument != null) { + Document positionDocument = (Document) aaPositionsDocument.get(Integer.toString(aaPosition)); + if (positionDocument != null) { + Document aaDocument = (Document) positionDocument.get(aaShortName); + if (aaDocument != null) { + if (aaDocument.get("ss") != null) { + scoreList.add(new Score(Double.parseDouble("" + aaDocument.get("ss")), + "sift", VariantAnnotationUtils.SIFT_DESCRIPTIONS.get(aaDocument.get("se")))); + } + if (aaDocument.get("ps") != null) { + scoreList.add(new Score(Double.parseDouble("" + aaDocument.get("ps")), + "polyphen", VariantAnnotationUtils.POLYPHEN_DESCRIPTIONS.get(aaDocument.get("pe")))); + } + } + } + } + + // Query for REVEL and ALPHAMISSENSE scores (different data model) + scoreList.addAll(getRevelAndAlphaMissenseScores(mongoDBCollection, chromosome, position, transcriptId, aaPosition, aa)); + + result.setResults(scoreList); + } + } + 
} + + // Return null if no transcript id is provided + return result; + } + +// private List getRevelAndAlphaMissenseScores(MongoDBCollection mongoDBCollection, String transcriptId, +// Integer position, String aa) { +// List scoreList = new ArrayList<>(); +// +// if (position == null || StringUtils.isEmpty(aa)) { +// return scoreList; +// } +// +// // Query for documents with source field (REVEL, ALPHAMISSENSE) +// Bson filter = Filters.and( +// Filters.eq("transcriptId", transcriptId), +// Filters.eq("aaPosition", position), +// Filters.in("source", "revel", "alphamissense") +// ); +// +// try { +// CellBaseDataResult documents = new CellBaseDataResult<>( +// mongoDBCollection.find(filter, new QueryOptions()) +// ); +// +// if (documents != null && !documents.getResults().isEmpty()) { +// for (Document document : documents.getResults()) { +// String source = (String) document.get("source"); +// List scoresArray = document.getList("scores", Document.class); +// +// if (scoresArray != null) { +// for (Document scoreDoc : scoresArray) { +// String aaAlternate = (String) scoreDoc.get("aaAlternate"); +// +// // Check if this score matches the requested alternate AA +// if (aa.equals(aaAlternate)) { +// Object scoreValue = scoreDoc.get("score"); +// if (scoreValue != null) { +// double score = Double.parseDouble(scoreValue.toString()); +// String description = (String) scoreDoc.get("effect"); +// scoreList.add(new Score(score, source, description)); +// } +// } +// } +// } +// } +// } +// } catch (Exception e) { +// logger.debug("Error retrieving REVEL and ALPHAMISSENSE scores for transcriptId: {}, position: {}", +// transcriptId, position, e); +// } +// +// return scoreList; +// } + + + private List getRevelAndAlphaMissenseScores(MongoDBCollection mongoDBCollection, String chromosome, Integer position, + String transcriptId, Integer aaPosition, String aa) { + Map scoreSet = new HashMap<>(); + + // aaPosition/position, aaAlternate are needed for this collection + if 
(transcriptId != null && StringUtils.isNotEmpty(aa)) { // Sanity check, protein substitution predictions do not contain the transcript ID version - String transcriptId = query.getTranscriptsId().get(0).split("\\.")[0]; - andBsonList.add(Filters.eq("transcriptId", transcriptId)); - andBsonList.add(Filters.eq("aaPosition", aaPosition)); + transcriptId = transcriptId.split("\\.")[0]; + + List andBsonList = new ArrayList<>(); + // Query without transcriptId filter (will filter in Java for better performance) + andBsonList.add(Filters.eq("chromosome", chromosome)); + andBsonList.add(Filters.in("source", REVEL_DATA, ALPHAMISSENSE_DATA)); + + // Efficient single OR query: aaPosition for ALPHAMISSENSE, position for REVEL + List orBsonList = new ArrayList<>(); + if (aaPosition != null) { + orBsonList.add(Filters.eq("aaPosition", aaPosition)); // ALPHAMISSENSE + } + if (position != null) { + orBsonList.add(Filters.eq("position", position)); // REVEL + } + if (!orBsonList.isEmpty()) { + andBsonList.add(Filters.or(orBsonList)); + } + String aaAlternate = aaShortNameMap.get(aa.toUpperCase()); - andBsonList.add(Filters.eq("scores.aaAlternate", aaAlternate)); + if (aaAlternate != null) { + andBsonList.add(Filters.eq("scores.aaAlternate", aaAlternate)); + } Bson bson = Filters.and(andBsonList); DataResult predictions = mongoDBCollection.find(bson, null, ProteinSubstitutionPrediction.class, @@ -119,6 +259,11 @@ public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, In if (predictions != null && CollectionUtils.isNotEmpty(predictions.getResults())) { for (ProteinSubstitutionPrediction prediction : predictions.getResults()) { + // Filter by transcriptId in Java (handles both single and semicolon-separated values) + if (!isTranscriptIdMatch(prediction.getTranscriptId(), transcriptId)) { + continue; + } + for (ProteinSubstitutionPredictionScore predictionScore : prediction.getScores()) { if (StringUtils.isNotEmpty(predictionScore.getAaAlternate()) && 
StringUtils.isNotEmpty(aaAlternate) && predictionScore.getAaAlternate().equals(aaAlternate)) { @@ -133,93 +278,62 @@ public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, In } } - int dbTime = Long.valueOf(System.currentTimeMillis() - dbTimeStart).intValue(); - return new CellBaseDataResult<>("getSubstitutionScores", dbTime, new ArrayList<>(), scoreSet.size(), - new ArrayList<>(scoreSet.values()), scoreSet.size()); + if (MapUtils.isEmpty(scoreSet)) { + return new ArrayList<>(); + } else { + return new ArrayList<>(scoreSet.values()); + } } -// public CellBaseDataResult getSubstitutionScores(Query query, QueryOptions options) { -// CellBaseDataResult result = null; -// -// // Ensembl transcript id is needed for this collection -// if (query.getString("transcript") != null) { -// Bson transcript = Filters.eq("transcriptId", query.getString("transcript")); -// -// int position = -1; -// String aaShortName = null; -// // If position and aa change are provided we create a 'projection' to return only the required data from the database -// if (query.get("position") != null && !query.getString("position").isEmpty() && query.getInt("position", 0) != 0) { -// position = query.getInt("position"); -// String projectionString = "aaPositions." + position; -// -// // If aa change is provided we only return that information -// if (query.getString("aa") != null && !query.getString("aa").isEmpty()) { -// aaShortName = aaShortNameMap.get(query.getString("aa").toUpperCase()); -// projectionString += "." 
+ aaShortName; -// } -// -// // Projection is used to minimize the returned data -// Bson positionProjection = Projections.include(projectionString); -// result = new CellBaseDataResult<>(proteinSubstitutionMongoDBCollection.find(transcript, positionProjection, options)); -// } else { -// // Return the whole transcript data -// result = new CellBaseDataResult<>(proteinSubstitutionMongoDBCollection.find(transcript, options)); -// } -// -// if (result != null && !result.getResults().isEmpty()) { -// Document document = (Document) result.getResults().get(0); -// Document aaPositionsDocument = (Document) document.get("aaPositions"); -// -// // Position or aa change were not provided, returning whole transcript data -// if (position == -1 || aaShortName == null) { -// // Return only the inner Document, not the whole document projected -// result.setResults(Collections.singletonList(aaPositionsDocument)); -// // Position and aa were provided, return only corresponding Score objects -// } else { -// List scoreList = null; -// if (result.getNumResults() == 1 && aaPositionsDocument != null) { -// scoreList = new ArrayList<>(NUM_PROTEIN_SUBSTITUTION_SCORE_METHODS); -// Document positionDocument = (Document) aaPositionsDocument.get(Integer.toString(position)); -// Document aaDocument = (Document) positionDocument.get(aaShortName); -// if (aaDocument.get("ss") != null) { -// scoreList.add(new Score(Double.parseDouble("" + aaDocument.get("ss")), -// "sift", VariantAnnotationUtils.SIFT_DESCRIPTIONS.get(aaDocument.get("se")))); -// } -// if (aaDocument.get("ps") != null) { -// scoreList.add(new Score(Double.parseDouble("" + aaDocument.get("ps")), -// "polyphen", VariantAnnotationUtils.POLYPHEN_DESCRIPTIONS.get(aaDocument.get("pe")))); -// } -// } -// result.setResults(scoreList); -// } -// } -// } -// // Return null if no transcript id is provided -// return result; -// -// } + /** + * Check if transcriptId matches the stored value. 
+ * Handles both single transcriptId and semicolon-separated list (REVEL format) + * @param storedTranscriptId the value from the database (can be "ENST..." or "ENST...;ENST...;ENST...") + * @param queryTranscriptId the transcript ID we're looking for + * @return true if queryTranscriptId is found in storedTranscriptId + */ + private boolean isTranscriptIdMatch(String storedTranscriptId, String queryTranscriptId) { + if (StringUtils.isEmpty(storedTranscriptId) || StringUtils.isEmpty(queryTranscriptId)) { + return false; + } - public CellBaseDataResult getVariantAnnotation(String ensemblTranscriptId, int position, String aaReference, - String aaAlternate, QueryOptions options, int dataRelease) + // For single value (ALPHAMISSENSE) or exact match + if (storedTranscriptId.equals(queryTranscriptId)) { + return true; + } + + // For semicolon-separated list (REVEL) + // Split and check if queryTranscriptId is in the list + String[] transcriptIds = storedTranscriptId.split(";"); + for (String id : transcriptIds) { + if (id.equals(queryTranscriptId)) { + return true; + } + } + + return false; + } + + public CellBaseDataResult getVariantAnnotation(Variant variant, String ensemblTranscriptId, int aaPosition, + String aaReference, String aaAlternate, QueryOptions options, + int dataRelease) throws CellBaseException { CellBaseDataResult cellBaseDataResult = new CellBaseDataResult<>(); - cellBaseDataResult.setId(ensemblTranscriptId + "/" + position + "/" + aaAlternate); + cellBaseDataResult.setId(ensemblTranscriptId + "/" + aaPosition + "/" + aaAlternate); long dbTimeStart = System.currentTimeMillis(); ProteinVariantAnnotation proteinVariantAnnotation = new ProteinVariantAnnotation(); - proteinVariantAnnotation.setPosition(position); + proteinVariantAnnotation.setPosition(aaPosition); proteinVariantAnnotation.setReference(aaReference); proteinVariantAnnotation.setAlternate(aaAlternate); -// Query query = new Query("transcript", ensemblTranscriptId).append("position", 
position).append("aa", aaAlternate); // Stop_gain/lost variants do not have SIFT/POLYPHEN scores -// System.out.println("aaReference = " + aaReference); -// System.out.println("aaAlternate = " + aaAlternate); -// if (!aaAlternate.equals("STOP") && !aaReference.equals("STOP")) { + if (!aaAlternate.equals("STOP") && !aaReference.equals("STOP")) { TranscriptQuery query = new TranscriptQuery(); query.setTranscriptsId(Collections.singletonList(ensemblTranscriptId)); query.setDataRelease(dataRelease); - proteinVariantAnnotation.setSubstitutionScores(getSubstitutionScores(query, position, aaAlternate).getResults()); -// } + proteinVariantAnnotation.setSubstitutionScores(getSubstitutionScores(query, variant.getChromosome(), variant.getStart(), + aaPosition, aaAlternate).getResults()); + } CellBaseDataResult proteinVariantData; String shortAlternativeAa = aaShortNameMap.get(aaAlternate); @@ -236,12 +350,12 @@ public CellBaseDataResult getVariantAnnotation(String pipeline.add(new Document("$unwind", "$feature")); BasicDBList andDBList2 = new BasicDBList(); - andDBList2.add(new Document("feature.location.position.position", position)); + andDBList2.add(new Document("feature.location.position.position", aaPosition)); andDBList2.add(new Document("feature.variation", shortAlternativeAa)); Document firstOr = new Document("$and", andDBList2); BasicDBList andDBList3 = new BasicDBList(); - andDBList3.add(new Document("feature.location.end.position", new Document("$gte", position))); - andDBList3.add(new Document("feature.location.begin.position", new Document("$lte", position))); + andDBList3.add(new Document("feature.location.end.position", new Document("$gte", aaPosition))); + andDBList3.add(new Document("feature.location.begin.position", new Document("$lte", aaPosition))); Document secondOr = new Document(); secondOr.put("$and", andDBList3); BasicDBList orList = new BasicDBList(); @@ -256,8 +370,8 @@ public CellBaseDataResult getVariantAnnotation(String pipeline.add(new 
Document("$group", groupFields)); MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, dataRelease); - proteinVariantData = executeAggregation2(ensemblTranscriptId + "_" + String.valueOf(position) + "_" - + aaAlternate, pipeline, new QueryOptions(), mongoDBCollection); + proteinVariantData = executeAggregation2(ensemblTranscriptId + "_" + aaPosition + "_" + aaAlternate, pipeline, + new QueryOptions(), mongoDBCollection); if (proteinVariantData.getNumResults() > 0) { proteinVariantAnnotation = processProteinVariantData(proteinVariantAnnotation, shortAlternativeAa, (Document) proteinVariantData.getResults().get(0)); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java index e1a068147..1fce32a06 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java @@ -76,7 +76,7 @@ public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, Integer p if (queryResult.getNumResults() > 0) { String transcriptId = queryResult.getResults().get(0).getId(); query.setTranscriptsId(Collections.singletonList(transcriptId)); - CellBaseDataResult scoresCellBaseDataResult = proteinDBAdaptor.getSubstitutionScores(query, position, aa); + CellBaseDataResult scoresCellBaseDataResult = proteinDBAdaptor.getSubstitutionScores(query, null, null, position, aa); scoresCellBaseDataResult.setId(transcriptId); return scoresCellBaseDataResult; } else { @@ -101,8 +101,8 @@ public CellBaseDataResult getSequence(ProteinQuery query) throws CellBas public CellBaseDataResult getVariantAnnotation(Variant variant, String ensemblTranscriptId, int aaPosition, String aaReference, String aaAlternate, QueryOptions options, int dataRelease) throws CellBaseException { - CellBaseDataResult proteinVariantAnnotation = 
proteinDBAdaptor.getVariantAnnotation(ensemblTranscriptId, - aaPosition, aaReference, aaAlternate, options, dataRelease); + CellBaseDataResult proteinVariantAnnotation = proteinDBAdaptor.getVariantAnnotation(variant, + ensemblTranscriptId, aaPosition, aaReference, aaAlternate, options, dataRelease); return proteinVariantAnnotation; } diff --git a/cellbase-lib/src/main/resources/mongodb-indexes.json b/cellbase-lib/src/main/resources/mongodb-indexes.json index cfc5164e9..19ac4e7da 100644 --- a/cellbase-lib/src/main/resources/mongodb-indexes.json +++ b/cellbase-lib/src/main/resources/mongodb-indexes.json @@ -185,6 +185,7 @@ {"collection": "protein_substitution_prediction", "fields": {"uniprotId": 1}, "options": {"background": true}} {"collection": "protein_substitution_prediction", "fields": {"transcriptId": 1}, "options": {"background": true}} {"collection": "protein_substitution_prediction", "fields": {"aaPosition": 1}, "options": {"background": true}} +{"collection": "protein_substitution_prediction", "fields": {"chromosome": 1, "source": 1, "aaPosition": 1, "position": 1}, "options": {"background": true}} {"collection": "common_polygenic_score", "fields": {"id": 1}, "options": {"background": true}} {"collection": "common_polygenic_score", "fields": {"name": 1}, "options": {"background": true}} diff --git a/cellbase-lib/src/test/resources/index/mongodb-indexes.json b/cellbase-lib/src/test/resources/index/mongodb-indexes.json index cfc5164e9..19ac4e7da 100644 --- a/cellbase-lib/src/test/resources/index/mongodb-indexes.json +++ b/cellbase-lib/src/test/resources/index/mongodb-indexes.json @@ -185,6 +185,7 @@ {"collection": "protein_substitution_prediction", "fields": {"uniprotId": 1}, "options": {"background": true}} {"collection": "protein_substitution_prediction", "fields": {"transcriptId": 1}, "options": {"background": true}} {"collection": "protein_substitution_prediction", "fields": {"aaPosition": 1}, "options": {"background": true}} +{"collection": 
"protein_substitution_prediction", "fields": {"chromosome": 1, "source": 1, "aaPosition": 1, "position": 1}, "options": {"background": true}} {"collection": "common_polygenic_score", "fields": {"id": 1}, "options": {"background": true}} {"collection": "common_polygenic_score", "fields": {"name": 1}, "options": {"background": true}} From fc14c9b5aa012543cf28ed3aa371aa05aace6bed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 17 Feb 2026 16:19:54 +0100 Subject: [PATCH 3/7] lib: improve sift/polyphen query by creating a new MongoDB index, #TASK-8163 --- .../lib/impl/core/ProteinMongoDBAdaptor.java | 57 ++----------------- .../src/main/resources/mongodb-indexes.json | 1 + .../test/resources/index/mongodb-indexes.json | 1 + 3 files changed, 7 insertions(+), 52 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java index 23061ad50..3afbbc0cd 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java @@ -108,7 +108,11 @@ public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, St // Ensembl transcript id is needed for this collection if (query.getTranscriptsId() != null && query.getTranscriptsId().get(0) != null) { String transcriptId = query.getTranscriptsId().get(0).split("\\.")[0]; - Bson transcript = Filters.eq("transcriptId", transcriptId); + // Filter for SIFT/POLYPHEN documents: they have the 'size' field, REVEL/ALPHAMISSENSE don't + List andBsonList = new ArrayList<>(); + andBsonList.add(Filters.eq("transcriptId", transcriptId)); + andBsonList.add(Filters.exists("size", true)); + Bson transcript = Filters.and(andBsonList); MongoDBCollection mongoDBCollection = 
getCollectionByRelease(proteinSubstitutionMongoDBCollectionByRelease, query.getDataRelease()); @@ -171,57 +175,6 @@ public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, St return result; } -// private List getRevelAndAlphaMissenseScores(MongoDBCollection mongoDBCollection, String transcriptId, -// Integer position, String aa) { -// List scoreList = new ArrayList<>(); -// -// if (position == null || StringUtils.isEmpty(aa)) { -// return scoreList; -// } -// -// // Query for documents with source field (REVEL, ALPHAMISSENSE) -// Bson filter = Filters.and( -// Filters.eq("transcriptId", transcriptId), -// Filters.eq("aaPosition", position), -// Filters.in("source", "revel", "alphamissense") -// ); -// -// try { -// CellBaseDataResult documents = new CellBaseDataResult<>( -// mongoDBCollection.find(filter, new QueryOptions()) -// ); -// -// if (documents != null && !documents.getResults().isEmpty()) { -// for (Document document : documents.getResults()) { -// String source = (String) document.get("source"); -// List scoresArray = document.getList("scores", Document.class); -// -// if (scoresArray != null) { -// for (Document scoreDoc : scoresArray) { -// String aaAlternate = (String) scoreDoc.get("aaAlternate"); -// -// // Check if this score matches the requested alternate AA -// if (aa.equals(aaAlternate)) { -// Object scoreValue = scoreDoc.get("score"); -// if (scoreValue != null) { -// double score = Double.parseDouble(scoreValue.toString()); -// String description = (String) scoreDoc.get("effect"); -// scoreList.add(new Score(score, source, description)); -// } -// } -// } -// } -// } -// } -// } catch (Exception e) { -// logger.debug("Error retrieving REVEL and ALPHAMISSENSE scores for transcriptId: {}, position: {}", -// transcriptId, position, e); -// } -// -// return scoreList; -// } - - private List getRevelAndAlphaMissenseScores(MongoDBCollection mongoDBCollection, String chromosome, Integer position, String transcriptId, Integer 
aaPosition, String aa) { Map scoreSet = new HashMap<>(); diff --git a/cellbase-lib/src/main/resources/mongodb-indexes.json b/cellbase-lib/src/main/resources/mongodb-indexes.json index 19ac4e7da..7a4c2b90a 100644 --- a/cellbase-lib/src/main/resources/mongodb-indexes.json +++ b/cellbase-lib/src/main/resources/mongodb-indexes.json @@ -186,6 +186,7 @@ {"collection": "protein_substitution_prediction", "fields": {"transcriptId": 1}, "options": {"background": true}} {"collection": "protein_substitution_prediction", "fields": {"aaPosition": 1}, "options": {"background": true}} {"collection": "protein_substitution_prediction", "fields": {"chromosome": 1, "source": 1, "aaPosition": 1, "position": 1}, "options": {"background": true}} +{"collection": "protein_substitution_prediction", "fields": {"transcriptId": 1, "size": 1}, "options": {"background": true}} {"collection": "common_polygenic_score", "fields": {"id": 1}, "options": {"background": true}} {"collection": "common_polygenic_score", "fields": {"name": 1}, "options": {"background": true}} diff --git a/cellbase-lib/src/test/resources/index/mongodb-indexes.json b/cellbase-lib/src/test/resources/index/mongodb-indexes.json index 19ac4e7da..0be7aaad6 100644 --- a/cellbase-lib/src/test/resources/index/mongodb-indexes.json +++ b/cellbase-lib/src/test/resources/index/mongodb-indexes.json @@ -185,6 +185,7 @@ {"collection": "protein_substitution_prediction", "fields": {"uniprotId": 1}, "options": {"background": true}} {"collection": "protein_substitution_prediction", "fields": {"transcriptId": 1}, "options": {"background": true}} {"collection": "protein_substitution_prediction", "fields": {"aaPosition": 1}, "options": {"background": true}} +{"collection": "protein_substitution_prediction", "fields": {"transcriptId": 1, "size": 1}, "options": {"background": true}} {"collection": "protein_substitution_prediction", "fields": {"chromosome": 1, "source": 1, "aaPosition": 1, "position": 1}, "options": {"background": true}} 
{"collection": "common_polygenic_score", "fields": {"id": 1}, "options": {"background": true}} From ae89628a8d77d92504439e360bc4c33feba8c248 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 17 Feb 2026 16:40:51 +0100 Subject: [PATCH 4/7] app: allow users to load polyphen/sift, revel and alphamissense scores independently, #TASK-8163 --- .../app/cli/admin/executors/LoadCommandExecutor.java | 8 ++++++-- .../opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java | 8 ++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 6ed95ec54..84d685b8f 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -355,6 +355,10 @@ private void loadConservation() throws IOException, CellBaseException { private void loadProteinFunctionalPrediction() throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, IOException, CellBaseException, LoaderException { + // Check if SIFT/POLYPHEN source has already been loaded + checkSourceAlreadyLoaded(SIFT_DATA); + checkSourceAlreadyLoaded(POLYPHEN_DATA); + loadData(input.resolve(PROTEIN_SUBSTITUTION_PREDICTION_DATA).resolve(PROTEIN_FUNCTIONAL_PREDICTION_DATA), PROTEIN_SUBSTITUTION_PREDICTION_DATA, "prot_func_pred_"); } @@ -364,7 +368,7 @@ private void loadRevel() throws CellBaseException { checkSourceAlreadyLoaded(REVEL_DATA); HashMap collectionMap = new HashMap<>(); - collectionMap.put(MISSENSE_VARIATION_SCORE_DATA, REVEL_DATA + JSON_GZ_EXTENSION); + collectionMap.put(PROTEIN_SUBSTITUTION_PREDICTION_DATA, REVEL_DATA + 
JSON_GZ_EXTENSION); Path revelPath = input.resolve(PROTEIN_SUBSTITUTION_PREDICTION_DATA).resolve(REVEL_DATA); loadData(revelPath, collectionMap); @@ -375,7 +379,7 @@ private void loadAlphaMissense() throws CellBaseException { checkSourceAlreadyLoaded(ALPHAMISSENSE_DATA); HashMap collectionMap = new HashMap<>(); - collectionMap.put(MISSENSE_VARIATION_SCORE_DATA, ALPHAMISSENSE_DATA + JSON_GZ_EXTENSION); + collectionMap.put(PROTEIN_SUBSTITUTION_PREDICTION_DATA, ALPHAMISSENSE_DATA + JSON_GZ_EXTENSION); Path alphaMissensePath = input.resolve(PROTEIN_SUBSTITUTION_PREDICTION_DATA).resolve(ALPHAMISSENSE_DATA); loadData(alphaMissensePath, collectionMap); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java index 062987922..b08806941 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java @@ -51,7 +51,7 @@ import java.util.concurrent.BlockingQueue; import java.util.stream.Collectors; -import static org.opencb.cellbase.lib.EtlCommons.MISSENSE_VARIATION_SCORE_DATA; +import static org.opencb.cellbase.lib.EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA; /** * Created by parce on 18/02/15. 
@@ -140,10 +140,10 @@ private String getCollectionName() throws LoaderException { + " are: " + StringUtils.join(releases, ",")); } - // Sanity check don't populate collections already populated - // Missense variation score data (i.e., revel and alphaMissense) is checked later, since revel and alphamissense are loaded + // Sanity check don't populate collections already populated, one exception: + // Protein substitution prediction data (i.e., polyphen, sift, revel and alphaMissense) is checked later, since they are loaded // in the same collection but independently - if (!data.equalsIgnoreCase(MISSENSE_VARIATION_SCORE_DATA)) { + if (!data.equalsIgnoreCase(PROTEIN_SUBSTITUTION_PREDICTION_DATA)) { for (Release dr : result.getResults()) { if (dr.getRelease() == dataRelease) { if (dr.getCollections().containsKey(data) && dr.getCollections().get(data).equals(collectionName)) { From ba8c6e3e7ee082e421cd7eb63e58836cfb0b8e01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 19 Feb 2026 08:36:33 +0100 Subject: [PATCH 5/7] Update .gitignore, #TASK-8163 --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index c99b547bc..b11375e36 100644 --- a/.gitignore +++ b/.gitignore @@ -85,3 +85,4 @@ download_log.json +.claude From fa2b8faa1dc367e6137cc6f71dd5dc76e430e255 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 19 Feb 2026 08:37:20 +0100 Subject: [PATCH 6/7] app: minor changes in cellbase-builder Dockerfile --- cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile index bcb2de9cb..19fbd31f9 100644 --- a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile +++ b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile @@ -1,5 +1,5 @@ -ARG TAG -FROM 
opencb/cellbase-base:$TAG +ARG TAG=latest +FROM opencb/cellbase-base:${TAG} LABEL org.label-schema.vendor="OpenCB" \ org.label-schema.name="cellbase-builder" \ @@ -32,4 +32,4 @@ RUN cd /opt/ensembl && \ ## Give writting permissions to allow the script ensembl_canonical.pl to create sub-folder for cache purposes RUN chmod -R 777 /opt/cellbase/scripts/ensembl-scripts/ -ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts:/opt/ensembl/biomart-perl/lib +ENV PERL5LIB=/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts:/opt/ensembl/biomart-perl/lib From f5e6f2332e84d9781f3c5887d7312f7d3cedd641 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 19 Feb 2026 08:37:50 +0100 Subject: [PATCH 7/7] app: improve error messages in Perl script: protein_function_prediction_matrices.pl, #TASK-8163 --- .../protein_function_prediction_matrices.pl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl b/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl index 3b7939fa9..ec8c93549 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl @@ -69,7 +69,7 @@ print "Generating the JSON file for the Sift version.\n"; $jsonVersion->{"id"} = "sift"; $jsonVersion->{"name"} = "Sift"; -open(FILE, ">".$outdir."/siftVersion.json") || die "error opening file\n"; +open(FILE, ">".$outdir."/siftVersion.json") || die "error
opening file [$outdir/siftVersion.json]: $!\n"; print FILE to_json($jsonVersion) . "\n"; close(FILE); @@ -77,7 +77,7 @@ print "Generating the JSON file for the PolyPhen version\n"; $jsonVersion->{"id"} = "polyphen"; $jsonVersion->{"name"} = "PolyPhen"; -open(FILE, ">".$outdir."/polyphenVersion.json") || die "error opening file\n"; +open(FILE, ">".$outdir."/polyphenVersion.json") || die "error opening file [$outdir/polyphenVersion.json]: $!\n"; print FILE to_json($jsonVersion) . "\n"; close(FILE); @@ -158,7 +158,8 @@ #my @all_chroms = @{$slice_adaptor->fetch_all('chromosome')}; foreach my $chr(@chromosomes) { my @transcripts = @{$chr->get_all_Transcripts()}; - open(FILE, ">".$outdir."/prot_func_pred_chr_".$chr->seq_region_name.".json") || die "error opening file\n"; + my $filename = $outdir."/prot_func_pred_chr_".$chr->seq_region_name.".json"; + open(FILE, ">".$filename) || die "error opening file [$filename]: $!\n"; print @transcripts." transcripts fetched!\n"; foreach my $trans(@transcripts) { if($trans->biotype eq 'protein_coding') {