# Introducing Isambard-AI: Interactive Chatbot tutorial
## Chat to Isambot (5 min)

In [None]:
# Hugging Face model identifier passed to `from_pretrained` in the cells below.
MODEL_ID = "microsoft/Phi-3-mini-128k-instruct"
# Directory passed as `cache_dir` when the model weights are loaded.
CACHE_DIR = "/projects/public/brics/cache"

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Seed PyTorch's RNG with a fixed value (presumably so generation is repeatable between runs).
torch.random.manual_seed(0)

In [None]:
# Load the causal language model for MODEL_ID from the shared cache directory.
# No `device_map` is given here, so device placement is left to the library default.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype="auto",
    cache_dir=CACHE_DIR,
)

# Load the tokenizer that belongs to the same model identifier.
# NOTE(review): unlike the model, no `cache_dir` is passed here -- confirm
# whether the tokenizer should also be read from CACHE_DIR.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


In [None]:
# Wrap the loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline("text-generation", tokenizer=tokenizer, model=model)

In [None]:
def isambot(pipe):
    """Run an interactive IsamBot chat loop on standard input/output.

    Each turn reads a prompt, appends it to the conversation history, sends
    the whole history to ``pipe`` and prints the reply. The loop ends when the
    user types ``exit`` (case-insensitive, surrounding whitespace ignored) or
    when input is closed or interrupted.

    Args:
        pipe: Callable invoked as
            ``pipe(chat, max_new_tokens=500, return_full_text=False)``, where
            ``chat`` is a list of ``{"role": ..., "content": ...}`` dicts. It
            must return a sequence holding exactly one mapping with a
            ``"generated_text"`` entry.

    Returns:
        None. All output is printed.
    """
    SYSTEM_PROMPT = "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."
    PIPELINE_KWARGS = {
        "max_new_tokens": 500,
        "return_full_text": False,
    }
    # Written as a named escape so the emoji survives any re-encoding of this file.
    HAT = "\N{TOP HAT}"

    print(f"{HAT} IsamBot chat")
    print("Type 'exit' to end the chat.")

    chat = [{"role": "system", "content": SYSTEM_PROMPT}]
    while True:
        # Get interactive user prompt; treat closed/interrupted input as "exit"
        try:
            user_input = input("?> ")
        except (EOFError, KeyboardInterrupt):
            print("\nExiting IsamBot chat. Goodbye!")
            break

        command = user_input.strip()
        if command.lower() == "exit":
            print("Exiting IsamBot chat. Goodbye!")
            break
        if not command:
            # Nothing to send to the model; ask again
            continue

        chat.append({"role": "user", "content": user_input})

        try:
            # Generate response
            response = pipe(chat, **PIPELINE_KWARGS)
            if len(response) != 1:
                raise ValueError("Expected a single response item")
            reply = response[0]["generated_text"]
        except Exception as e:
            # Remove the unanswered prompt so the history keeps alternating
            # user/assistant turns on the next attempt.
            chat.pop()
            print(f"An error occurred: {e}")
            continue

        # Output response
        print(f"{HAT}> {reply}\n")

        # Append response to chat history
        chat.append({"role": "assistant", "content": reply})

In [None]:
# Start an interactive chat session using the pipeline created earlier.
isambot(pipe)

## Measuring Token Output Rate (5 min)

In [None]:
import time


def measure_performance(prompt, model, tokenizer, device, max_new_tokens=50):
    """Time one ``model.generate`` call and report newly generated tokens per second.

    The tokenized prompt is moved to ``device`` before generation. The model
    itself is not moved by this function, so it should already be on that
    device.

    Args:
        prompt: Text to tokenize and feed to the model.
        model: Object exposing ``generate(input_ids, attention_mask=...,
            max_new_tokens=...)`` that returns a tensor whose last dimension
            is the sequence length (prompt plus generated tokens).
        tokenizer: Callable invoked as ``tokenizer(prompt, return_tensors="pt")``;
            the result must support ``.to(device)``, ``["input_ids"]`` and ``.get``.
        device: Device passed to ``.to()`` for the tokenized inputs.
        max_new_tokens: Upper bound on the number of tokens to generate,
            excluding the prompt.

    Returns:
        Generated tokens per second as a float.
    """
    # Tokenize input prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[-1]

    # perf_counter is monotonic and intended for measuring short intervals
    start_time = time.perf_counter()

    # Generate output tokens. `max_new_tokens` bounds only the generated part,
    # whereas `max_length` would also count the prompt.
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs.get("attention_mask"),
        max_new_tokens=max_new_tokens,
    )

    elapsed_time = time.perf_counter() - start_time

    # The output holds prompt + continuation; count only the continuation
    num_tokens = outputs.shape[-1] - prompt_length

    # Calculate tokens per second (guard against a zero-length interval)
    tokens_per_second = num_tokens / elapsed_time if elapsed_time > 0 else float("inf")

    print(f"Generated {num_tokens} tokens in {elapsed_time:.4f} seconds")
    print(f"Performance: {tokens_per_second:.2f} tokens per second")

    return tokens_per_second

In [None]:
# Fixed benchmark prompt, then a timed generation run with the inputs placed on the CPU.
prompt = "This is a test to measure the model's performance in generating tokens."
tokens_per_second_cpu = measure_performance(prompt, model, tokenizer, device="cpu", max_new_tokens=50)

## Hardware Monitoring (5 min)

In [None]:
# Repeat the CPU benchmark (per this section's heading, presumably to watch hardware usage while it runs).
tokens_per_second_cpu = measure_performance(prompt, model, tokenizer, device="cpu", max_new_tokens=50)

In [None]:
# Chat again with the same pipeline (presumably while monitoring hardware, per this section's heading).
isambot(pipe)

## Mapping the model to a device (5 min)

In [None]:
# Target device for the reloaded model; reused later as the benchmark device.
DEVICE_MAP = "cuda:0"

# Reload the model, this time asking the loader to place it on DEVICE_MAP.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    cache_dir=CACHE_DIR,
    device_map=DEVICE_MAP,
    trust_remote_code=True,
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Rebuild the text-generation pipeline around the newly loaded model.
pipe = pipeline("text-generation", tokenizer=tokenizer, model=model)

## Break the Model (10 min)

In [None]:
# Chat using the pipeline rebuilt around the model loaded with DEVICE_MAP.
isambot(pipe)

In [None]:
# Same benchmark prompt as the CPU run, now with the inputs placed on DEVICE_MAP.
prompt = "This is a test to measure the model's performance in generating tokens."
# Stored under its own name so the earlier `tokens_per_second_cpu` figure is
# not overwritten and the two results can be compared.
tokens_per_second_gpu = measure_performance(prompt, model, tokenizer, device=DEVICE_MAP, max_new_tokens=50)

## Running a Bigger Model (Bonus round)

In [None]:
# Point MODEL_ID at a different (per the section heading, bigger) model for the bonus round.
MODEL_ID = "nvidia/Llama3-ChatQA-1.5-70B"