"""
Example script for running inference with the Hugging Face model.
"""
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import warnings
# Suppress the TypedStorage deprecation warning
warnings.filterwarnings('ignore', category=UserWarning, message='.*TypedStorage is deprecated.*')
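
# Optional helper (not part of the original script): seed the RNGs so the
# sampled generations below are reproducible across runs. A minimal sketch;
# call set_seed() at the top of main() if deterministic output is desired.
def set_seed(seed: int = 42):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)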
def main():
    # Path to the local model directory, or "YOUR_USERNAME/YOUR_MODEL_NAME" for the HF Hub
    model_path = "."

    print("Loading model and tokenizer...")
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Move the model to GPU if one is available and switch to inference mode
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    model.eval()

    print(f"Model loaded on {device}")
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    prompts = [
        "Once upon a time",
        "The quick brown fox",
    ]
    for prompt in prompts:
        print(f"\n{'='*60}")
        print(f"Prompt: {prompt}")
        print(f"{'='*60}")

        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Sample a continuation; disable gradient tracking since this is inference only
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=100,  # total length, prompt tokens included
                temperature=1.0,
                top_k=50,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"\nGenerated:\n{generated_text}")
if __name__ == "__main__":
main()
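
# Note: max_length above counts the prompt tokens toward the limit. To cap
# only the newly generated tokens, transformers also accepts max_new_tokens,
# e.g.:
#
#     outputs = model.generate(**inputs, max_new_tokens=100, do_sample=True,
#                              pad_token_id=tokenizer.eos_token_id)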