GENERanno
Collection
3 items
•
Updated
•
1
In this repository, we present GENERanno-cds-annotator, which is fine-tuned from GENERanno-prokaryote-0.5b-base for metagenomic annotation tasks. In comprehensive evaluations, GENERanno-cds-annotator achieves higher accuracy than traditional HMM-based methods (e.g., GLIMMER3, GeneMarkS2, Prodigal) and recent LLM-based approaches (e.g., GeneLM), while demonstrating exceptional generalization ability on archaeal genomes. The detailed annotation results are provided here.
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
# Example: annotate coding regions (CDS) on both strands of a DNA sequence
# with the GENERanno CDS annotator and print one annotation string per head.

# Load the tokenizer and model.
# NOTE: `trust_remote_code=True` lets transformers run the custom model code
# shipped in the model repository; only enable it for repositories you trust.
model_name = "GenerTeam/GENERanno-prokaryote-0.5b-cds-annotator"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForTokenClassification.from_pretrained(model_name, trust_remote_code=True)
model.eval()  # Set the model to evaluation mode

# Prepare the input sequence. Let's use a sample sequence.
sequence = "ATGAGGTGGCAAGAAATGGGCTACGAATTCCATGAGGCTATAGAATAATCTAAGAGAAAT"

# Tokenize the sequence. It's crucial to use `add_special_tokens=False`.
inputs = tokenizer(sequence, add_special_tokens=False, return_tensors="pt")
input_ids = inputs["input_ids"]

# The number of tokens should be equal to the length of the sequence.
# The per-head slicing below relies on this, so fail loudly (an `assert`
# would be skipped when Python runs with -O).
sequence_length = len(sequence)
if input_ids.shape[1] != sequence_length:
    raise ValueError(
        f"Expected {sequence_length} tokens (one per base), "
        f"got {input_ids.shape[1]}."
    )

# Run the forward pass without autograd bookkeeping and take the most likely
# label id at every output position.
with torch.inference_mode():
    logits = model(**inputs).logits
    raw_predictions = logits.argmax(dim=-1).cpu()

# Post-process the predictions
# This model features multiple prediction heads (for the positive and negative strands).
# The predictions for all heads are concatenated into a single output tensor.

# Get the model's configuration for processing the output
id2label = model.config.id2label
num_heads = getattr(model, "num_prediction_heads", 2)  # Defaults to 2 if not specified

# Define the mapping from model labels to annotation characters:
# "CDS" -> "+" (Indicates a Coding DNA Sequence)
# "NON_CODING" -> "-" (Indicates a non-coding region)
label_to_char = {"CDS": "+", "NON_CODING": "-"}

print(f"Model has {num_heads} prediction head(s). Processing results...")

# The `raw_predictions` tensor has a shape of (batch_size, sequence_length * num_heads).
# We need to de-concatenate the predictions for each head.
all_head_annotations = []
preds_for_sequence = raw_predictions[0]  # Get predictions for the first sequence in the batch

for h in range(num_heads):
    # Extract the slice of predictions corresponding to the current head
    start_idx = h * sequence_length
    end_idx = (h + 1) * sequence_length
    # `.tolist()` converts the whole slice to Python ints in one call.
    head_preds_ids = preds_for_sequence[start_idx:end_idx].tolist()

    # Map the numeric prediction IDs to their string labels (e.g., 1 -> 'CDS')
    head_preds_labels = [id2label[pred_id] for pred_id in head_preds_ids]

    # Convert the string labels into the final annotation string (e.g., '+'/'-')
    annotation_string = "".join(label_to_char[label] for label in head_preds_labels)
    all_head_annotations.append(annotation_string)

# Display the final annotations
# For this model, the two heads correspond to the positive and negative DNA strands.
head_names = ["Positive Strand", "Negative Strand"] if num_heads == 2 else [f"Head {i+1}" for i in range(num_heads)]

print("\n--- Annotation Results ---")
print(f"Input Sequence: {sequence}")
for head_name, annotation in zip(head_names, all_head_annotations):
    print(f"Annotation ({head_name}): {annotation}")
print("--------------------------\n")

# How to interpret the output:
# - A '+' at a position for the "Positive Strand" annotation means the model predicts that base
#   is part of a coding sequence on the forward (5' to 3') strand.
# - A '+' at a position for the "Negative Strand" annotation means the model predicts that base
#   is part of a coding sequence on the reverse complementary strand.
# - A '-' indicates a non-coding region for that respective strand.
@article{li2025generanno,
author = {Li, Qiuyi and Wu, Wei and Zhu, Yiheng and Feng, Fuli and Ye, Jieping and Wang, Zheng},
title = {GENERanno: A Genomic Foundation Model for Metagenomic Annotation},
elocation-id = {2025.06.04.656517},
year = {2025},
doi = {10.1101/2025.06.04.656517},
publisher = {Cold Spring Harbor Laboratory},
URL = {https://www.biorxiv.org/content/early/2025/06/05/2025.06.04.656517},
journal = {bioRxiv}
}
Base model
GenerTeam/GENERanno-prokaryote-0.5b-base