Fixed GIL issue
Race condition between CoreML and causal_mask update
chat.py
CHANGED
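The race addressed here apparently came from run_prefill() and generate_next_token() each rebuilding the causal attention mask with make_causal_mask() while CoreML predict() calls were running on other threads; the commit instead builds the mask once (initialize_causal_mask) and hands read-only slices of it to the prefill and infer calls. The sketch below condenses that pattern. The body of make_causal_mask beyond the two lines visible in the diff, and the concrete sizes, are assumptions for illustration only.

import numpy as np
import torch

def make_causal_mask(length, start):
    # Additive attention mask: 0 where a position may attend, -inf elsewhere.
    # (Only the final masking line and the return appear verbatim in the diff;
    # the construction above them is assumed.)
    mask = np.full((1, 1, length, length), -np.inf, dtype=np.float16)
    row_indices = np.arange(length).reshape(length, 1)
    col_indices = np.arange(length).reshape(1, length)
    mask[:, :, col_indices <= (row_indices + start)] = 0
    return mask

def initialize_causal_mask(context_length):
    # Built once, before any CoreML predict() call, then shared read-only.
    return torch.tensor(make_causal_mask(context_length, 0), dtype=torch.float16)

causal_mask = initialize_causal_mask(512)               # once, in main()

# Prefill slices a whole batch of rows out of the shared mask ...
batch_causal_mask = causal_mask[:, :, 0:64, :]          # shape (1, 1, 64, 512)
# ... while single-token inference slices exactly one row.
single_causal_mask = causal_mask[:, :, 9:10, :]         # shape (1, 1, 1, 512)
print(batch_causal_mask.shape, single_causal_mask.shape)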
@@ -26,8 +26,10 @@ DARK_BLUE = "\033[34m"
 LIGHT_GREEN = "\033[92m"
 RESET_COLOR = "\033[0m"

-# Add at top with other constants
 WARMUP_TOKEN_LIMIT = 10  # Maximum tokens to generate during warmup

 class TokenPrinter:
     """Handles background printing of generated tokens."""
@@ -40,9 +42,12 @@ class TokenPrinter:
         self.lock = threading.Lock()
         self.thinking = True  # Track if we're still in thinking mode
         self.decoding_buffer = []  # Buffer for token IDs
-        #
         self.start_time = time.time()
         self.token_count = 0
         self.start()

     def start(self):
@@ -103,15 +108,15 @@ class TokenPrinter:
                 self.thread.join(timeout=1.0)
             except Exception:
                 pass
-        #
-        elapsed = time.time() - self.start_time
-        if elapsed > 0 and self.token_count > 0:
-            tokens_per_sec = self.token_count / elapsed
-            print(f"\n{DARK_BLUE}{tokens_per_sec:.1f} t/s{RESET_COLOR}")
-        else:
-            print(RESET_COLOR)  # Reset color at the end
         return self.buffer

 def parse_model_path(path):
     """Parse model path and return full path with .mlmodelc or .mlpackage extension."""
     path = Path(path)
@@ -188,6 +193,89 @@ def load_model(path, function_name=None):
     print("\nTry using the .mlpackage version instead, or recompile the model.")
     raise

 def load_metadata(model,args):
     # Extract metadata and config parameters
     metadata = {}
@@ -386,102 +474,99 @@ def make_causal_mask(length, start):
     mask[:, :, col_indices <= (row_indices + start)] = 0
     return mask

-def run_prefill(embed_model, ffn_models, input_ids,
     """Run prefill on the input sequence."""
-    #
-    causal_mask = make_causal_mask(context_length, 0)
-    causal_mask = torch.tensor(causal_mask, dtype=torch.float16)

     # Process in batches
     batch_pos = 0
-    while batch_pos <
-        batch_end = min(batch_pos + batch_size,
         current_batch_size = batch_end - batch_pos

         # Get current batch
         batch_input = input_ids[:, batch_pos:batch_end]

-        #
         batch_input = F.pad(
             batch_input,
             (0, batch_size - current_batch_size),
             value=0
         )

-        # Generate position IDs for
-        position_ids = torch.arange(batch_size, dtype=torch.int32)

         # Run embeddings
         hidden_states = torch.from_numpy(
             embed_model.predict({'input_ids': batch_input.numpy()})['hidden_states']
         )

-        # Run through FFN chunks
         for ffn_model in ffn_models:
             if isinstance(ffn_model, dict):
                 inputs = {
-                    'hidden_states': hidden_states.numpy(),
-                    'position_ids': position_ids.numpy(),
-                    'causal_mask': batch_causal_mask.numpy(),
-                    'current_pos': np.array([batch_pos], dtype=np.int32)
                 }
                 output = ffn_model['prefill'].predict(inputs, state)
                 hidden_states = torch.from_numpy(output['output_hidden_states'])

         batch_pos = batch_end

-    return torch.tensor([

-def generate_next_token(embed_model, ffn_models, lmhead_model, input_ids, pos, context_length, state
     """Generate the next token."""
     # Get current token
-    current_token = input_ids[:, pos-1:pos]

     # Run embeddings
     hidden_states = torch.from_numpy(
         embed_model.predict({'input_ids': current_token.numpy()})['hidden_states']
-    )

     # Create masks
     update_mask = torch.zeros((1, 1, context_length, 1), dtype=torch.float16)
     update_mask[0, 0, pos-1, 0] = 1.0
-    position_ids = torch.tensor([pos-1], dtype=torch.int32)
-    causal_mask = make_causal_mask(context_length, 0)
-    causal_mask = torch.tensor(causal_mask[:, :, pos-1:pos, :], dtype=torch.float16)  # [1, 1, 1, context_length]

-    #
     for ffn_model in ffn_models:
         if isinstance(ffn_model, dict):
             inputs = {
                 'hidden_states': hidden_states.numpy(),
                 'update_mask': update_mask.numpy(),
                 'position_ids': position_ids.numpy(),
-                'causal_mask':
                 'current_pos': position_ids.numpy()
             }
             output = ffn_model['infer'].predict(inputs, state)
             hidden_states = torch.from_numpy(output['output_hidden_states'])

-    # Run LM head
     lm_output = lmhead_model.predict({'hidden_states': hidden_states.numpy()})
-    # Debug print
-    #print("\nLM Head output keys:", list(lm_output.keys()))

-    # Combine logits1-8 if they exist
     if 'logits1' in lm_output:
-        # Concatenate all logits parts
         logits_parts = []
         for i in range(1, 9):
             key = f'logits{i}'
             if key in lm_output:
                 logits_parts.append(torch.from_numpy(lm_output[key]))
-        logits = torch.cat(logits_parts, dim=-1)
     else:
-        # Try output_logits as fallback
         logits = torch.from_numpy(lm_output['output_logits'])

-    # Apply temperature and sample
     if temperature > 0:
         logits = logits / temperature
         probs = F.softmax(logits[0, -1, :], dim=-1)
@@ -503,36 +588,93 @@ def create_unified_state(ffn_models, context_length):
     print("\nCreated unified transformer state")
     return state

-def
     """Interactive chat loop."""
     context_length = metadata.get('context_length')
     batch_size = metadata.get('batch_size', 64)

     if not warmup:
         print(f"\nUsing context length: {context_length}")
         print("\nStarting chat session. Press Ctrl+D to exit.")
-        print("Type your message and press Enter to chat.")
-
-    # Check if tokenizer has chat template and if it works
-    has_chat_template = False
-    try:
-        # Test if chat template works
-        test_messages = [{"role": "user", "content": "test"}]
-        tokenizer.apply_chat_template(test_messages, return_tensors="pt")
-        has_chat_template = True
-        if not warmup:
-            print("\nUsing chat template for prompts")
-    except:
-        if not warmup:
-            print("\nUsing manual formatting for prompts")

     conversation = []

     try:
         while True:
             try:
                 if not warmup:
-                    print(f"\n{LIGHT_GREEN}You:{RESET_COLOR}", end=' ', flush=True)
                 if auto_prompt is not None:
                     user_input = auto_prompt
                     if not warmup:
@@ -543,41 +685,69 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
                 if not warmup:
                     print("\nExiting chat...")
                 break
                 if not user_input:
                     continue

-                #
                         return_tensors="pt",
                         add_generation_prompt=True
                     ).to(torch.int32)
                 else:
-                    input_ids = tokenizer(
-                        formatted_prompt,
                         return_tensors="pt",
-                    ).

                 if not warmup:
                     print(f"\n{LIGHT_BLUE}Assistant:{RESET_COLOR}", end=' ', flush=True)

-                # Initialize token printer
                 token_printer = TokenPrinter(tokenizer)

                 try:
-                    #
-                    prefill_start = time.time()
-
-                    # Run prefill with state
                     current_pos = run_prefill(
                         embed_model,
                         ffn_models,
@@ -585,21 +755,53 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
                         context_pos,
                         context_length,
                         batch_size,
-                        state
                     )

-                    #
-                    prefill_time = time.time() - prefill_start
-                    prefill_tokens = context_pos  # Number of tokens in input
-                    prefill_tokens_per_sec = prefill_tokens / prefill_time if prefill_time > 0 else 0
-
-                    # Generation loop with state
-                    input_ids = input_ids
                     pos = context_pos

-                    while
                         # Generate next token
                         next_token = generate_next_token(
                             embed_model,
@@ -608,146 +810,58 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
                             input_ids,
                             pos,
                             context_length,
-                            state
                         )

-                        # Add token
-                        input_ids[0, pos] = next_token
-                        else:
-                            input_ids = torch.cat([
-                                input_ids,
-                                torch.tensor([[next_token]], dtype=torch.int32)
-                            ], dim=1)
-
-                        # Add to printer only if not in warmup
                         if not warmup:
                             token_printer.add_token(next_token)
                             token_printer.drain_buffer()

                         pos += 1
                         tokens_generated += 1
-                        inference_tokens += 1

-                        #
                         if warmup and tokens_generated >= WARMUP_TOKEN_LIMIT:
                             break
                         if next_token == tokenizer.eos_token_id:
                             break

-                    # Calculate inference

-                    #
                     if not warmup:
-                        print(f"
-                    token_printer.stop()  # Clean up without printing stats

-                    # Exit after one response in auto_prompt mode
                     if auto_prompt is not None:
                         break

                 except KeyboardInterrupt:
                     token_printer.stop()
                     continue

         except Exception as e:

-def parse_args():
-    parser = argparse.ArgumentParser(description='Chat with CoreML LLaMA (c) 2025 Anemll')
-
-    # Add meta.yaml option
-    parser.add_argument('--meta', type=str, help='Path to meta.yaml to load all parameters')
-
-    # Model paths
-    parser.add_argument('--d', '--dir', type=str, default='.',
-                        help='Directory containing model files (default: current directory)')
-    parser.add_argument('--embed', type=str, required=False,
-                        help='Path to embeddings model (relative to --dir)')
-    parser.add_argument('--ffn', type=str, required=False,
-                        help='Path to FFN model (can be chunked, relative to --dir)')
-    parser.add_argument('--lmhead', type=str, required=False,
-                        help='Path to LM head model (relative to --dir)')
-    parser.add_argument('--tokenizer', type=str, required=False,
-                        help='Path to tokenizer')
-
-    # Add new argument for auto-generation
-    parser.add_argument('--prompt', type=str,
-                        help='If specified, run once with this prompt and exit')
-
-    # Add no-warmup flag
-    parser.add_argument('--nw', action='store_true',
-                        help='Skip warmup phase')
-
-    # Model configuration
-    parser.add_argument('--context-length', type=int,
-                        help='Context length for the model (default: 512), if not provided, it will be detected from the model directory name ctxNUMBER')
-    parser.add_argument('--batch-size', type=int,
-                        help='Batch size for prefill (default: 64)')
-
-    args = parser.parse_args()
-
-    # If meta.yaml is provided, load parameters from it
-    if args.meta:
-        try:
-            with open(args.meta, 'r') as f:
-                meta = yaml.safe_load(f)
-            params = meta['model_info']['parameters']
-
-            # Set model directory to meta.yaml directory if not specified
-            if not args.d or args.d == '.':
-                args.d = str(Path(args.meta).parent)
-
-            # Build model paths based on parameters
-            prefix = params.get('model_prefix', 'llama')  # Default to 'llama' if not specified
-            lut_ffn = f"_lut{params['lut_ffn']}" if params['lut_ffn'] != 'none' else ''
-            lut_lmhead = f"_lut{params['lut_lmhead']}" if params['lut_lmhead'] != 'none' else ''
-            num_chunks = int(params['num_chunks'])
-
-            # Set model paths if not specified
-            if not args.embed:
-                args.embed = f'{prefix}_embeddings'
-            if not args.lmhead:
-                args.lmhead = f'{prefix}_lm_head{lut_lmhead}'
-            if not args.ffn:
-                args.ffn = f'{prefix}_FFN_PF{lut_ffn}_chunk_01of{num_chunks:02d}'
-            if not args.tokenizer:
-                args.tokenizer = args.d
-
-            # Set other parameters if not overridden by command line
-            if args.context_length is None:
-                args.context_length = int(params['context_length'])
-            if args.batch_size is None:
-                args.batch_size = int(params['batch_size'])
-            args.num_chunks = num_chunks
-
-            print(f"\nLoaded parameters from {args.meta}:")
-            print(f"  Context Length: {args.context_length}")
-            print(f"  Batch Size: {args.batch_size}")
-            print(f"  Num Chunks: {args.num_chunks}")
-            print(f"  Models Directory: {args.d}")
-            print(f"  Embeddings: {args.embed}")
-            print(f"  LM Head: {args.lmhead}")
-            print(f"  FFN: {args.ffn}")
-
-        except Exception as e:
-            print(f"\nError loading meta.yaml: {str(e)}")
-            sys.exit(1)
-
-    return args

 def main():
     args = parse_args()
@@ -800,6 +914,9 @@ def main():
     # Create unified state once
    state = create_unified_state(ffn_models, metadata['context_length'])

     # Warmup runs to prevent Python GIL issues with CoreML !
     if not args.nw:
         for i in range(2):
@@ -809,7 +926,8 @@
                 lmhead_model=lmhead_model,
                 tokenizer=tokenizer,
                 metadata=metadata,
-                state=state,
                 warmup=True,
                 auto_prompt="who are you?"
             )
@@ -821,7 +939,8 @@
         lmhead_model=lmhead_model,
         tokenizer=tokenizer,
         metadata=metadata,
-        state=state,
         warmup=False,
         auto_prompt=args.prompt
     )
@@ -26,8 +26,10 @@ DARK_BLUE = "\033[34m"
 LIGHT_GREEN = "\033[92m"
 RESET_COLOR = "\033[0m"

+# Add at the top with other constants
 WARMUP_TOKEN_LIMIT = 10  # Maximum tokens to generate during warmup
+THINKING_MODE = False
+THINKING_PROMPT = """You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the problem and deliberate with yourself via systematic reasoning processes to help come to a correct solution prior to answering. You should enclose your thoughts and internal monologue inside <think> </think> tags, and then provide your solution or response to the problem."""

 class TokenPrinter:
     """Handles background printing of generated tokens."""
@@ -40,9 +42,12 @@ class TokenPrinter:
         self.lock = threading.Lock()
         self.thinking = True  # Track if we're still in thinking mode
         self.decoding_buffer = []  # Buffer for token IDs
+        # Timing and stats tracking
         self.start_time = time.time()
         self.token_count = 0
+        self.prefill_time = 0
+        self.inference_time = 0
+        self.context_pos = 0
         self.start()

     def start(self):
@@ -103,15 +108,15 @@ class TokenPrinter:
                 self.thread.join(timeout=1.0)
             except Exception:
                 pass
+        print(RESET_COLOR)  # Reset color at the end
         return self.buffer

+    def set_timing(self, prefill_time, inference_time, context_pos):
+        """Set timing information."""
+        self.prefill_time = prefill_time
+        self.inference_time = inference_time
+        self.context_pos = context_pos
+
 def parse_model_path(path):
     """Parse model path and return full path with .mlmodelc or .mlpackage extension."""
     path = Path(path)
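TokenPrinter now carries prefill_time, inference_time and context_pos alongside its token count, and set_timing() simply stores them; chat_loop later derives the throughput line from the same quantities. A rough trace of that arithmetic with made-up figures (the numbers are assumptions, only the formulas come from the diff):

# Hypothetical run: a 150-token prompt prefilled in 0.5 s, then 120 tokens
# generated in 2.5 s by the token-by-token inference loop.
context_pos, prefill_time, inference_time = 150, 0.5, 2.5
response_token_count = 120

inference_tokens_per_sec = response_token_count / inference_time if inference_time > 0 else 0
prefill_ms = prefill_time * 1000
prefill_tokens_per_sec = context_pos / prefill_time if prefill_time > 0 else 0

print(f"{inference_tokens_per_sec:.1f} t/s, "
      f"TTFT: {prefill_ms:.1f}ms ({prefill_tokens_per_sec:.1f} t/s), "
      f"{response_token_count} tokens")
# -> 48.0 t/s, TTFT: 500.0ms (300.0 t/s), 120 tokens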
@@ -188,6 +193,89 @@ def load_model(path, function_name=None):
     print("\nTry using the .mlpackage version instead, or recompile the model.")
     raise

+def parse_args():
+    parser = argparse.ArgumentParser(description='Full Chat with CoreML LLaMA with context window shifting, gil resolved (c) 2025 Anemll')
+
+    # Add meta.yaml option
+    parser.add_argument('--meta', type=str, help='Path to meta.yaml to load all parameters')
+
+    # Add existing arguments
+    parser.add_argument('--d', '--dir', type=str, default='.',
+                        help='Directory containing model files (default: current directory)')
+    parser.add_argument('--embed', type=str, required=False,
+                        help='Path to embeddings model (relative to --dir)')
+    parser.add_argument('--ffn', type=str, required=False,
+                        help='Path to FFN model (can be chunked, relative to --dir)')
+    parser.add_argument('--lmhead', type=str, required=False,
+                        help='Path to LM head model (relative to --dir)')
+    parser.add_argument('--tokenizer', type=str, required=False,
+                        help='Path to tokenizer')
+
+    # Add new argument for auto-generation
+    parser.add_argument('--prompt', type=str,
+                        help='If specified, run once with this prompt and exit')
+
+    # Add no-warmup flag
+    parser.add_argument('--nw', action='store_true',
+                        help='Skip warmup phase')
+
+    # Model configuration
+    parser.add_argument('--context-length', type=int,
+                        help='Context length for the model (default: 512), if not provided, it will be detected from the model directory name ctxNUMBER')
+    parser.add_argument('--batch-size', type=int,
+                        help='Batch size for prefill (default: 64)')
+
+    args = parser.parse_args()
+
+    # If meta.yaml is provided, load parameters from it
+    if args.meta:
+        try:
+            with open(args.meta, 'r') as f:
+                meta = yaml.safe_load(f)
+            params = meta['model_info']['parameters']
+
+            # Set model directory to meta.yaml directory if not specified
+            if not args.d or args.d == '.':
+                args.d = str(Path(args.meta).parent)
+
+            # Build model paths based on parameters
+            prefix = params.get('model_prefix', 'llama')  # Default to 'llama' if not specified
+            lut_ffn = f"_lut{params['lut_ffn']}" if params['lut_ffn'] != 'none' else ''
+            lut_lmhead = f"_lut{params['lut_lmhead']}" if params['lut_lmhead'] != 'none' else ''
+            num_chunks = int(params['num_chunks'])
+
+            # Set model paths if not specified
+            if not args.embed:
+                args.embed = f'{prefix}_embeddings'
+            if not args.lmhead:
+                args.lmhead = f'{prefix}_lm_head{lut_lmhead}'
+            if not args.ffn:
+                args.ffn = f'{prefix}_FFN_PF{lut_ffn}_chunk_01of{num_chunks:02d}'
+            if not args.tokenizer:
+                args.tokenizer = args.d
+
+            # Set other parameters if not overridden by command line
+            if args.context_length is None:
+                args.context_length = int(params['context_length'])
+            if args.batch_size is None:
+                args.batch_size = int(params['batch_size'])
+            args.num_chunks = num_chunks
+
+            print(f"\nLoaded parameters from {args.meta}:")
+            print(f"  Context Length: {args.context_length}")
+            print(f"  Batch Size: {args.batch_size}")
+            print(f"  Num Chunks: {args.num_chunks}")
+            print(f"  Models Directory: {args.d}")
+            print(f"  Embeddings: {args.embed}")
+            print(f"  LM Head: {args.lmhead}")
+            print(f"  FFN: {args.ffn}")
+
+        except Exception as e:
+            print(f"\nError loading meta.yaml: {str(e)}")
+            sys.exit(1)
+
+    return args
+
 def load_metadata(model,args):
     # Extract metadata and config parameters
     metadata = {}
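When --meta is given, parse_args() derives the model file names from the parameters block of meta.yaml. A small trace of that naming logic with made-up parameter values (the values are assumptions; the format strings are the ones from parse_args()):

# Hypothetical meta.yaml parameters
params = {'model_prefix': 'llama', 'lut_ffn': '6', 'lut_lmhead': '6',
          'num_chunks': 2, 'context_length': 512, 'batch_size': 64}

prefix = params.get('model_prefix', 'llama')
lut_ffn = f"_lut{params['lut_ffn']}" if params['lut_ffn'] != 'none' else ''
lut_lmhead = f"_lut{params['lut_lmhead']}" if params['lut_lmhead'] != 'none' else ''
num_chunks = int(params['num_chunks'])

print(f'{prefix}_embeddings')                                  # llama_embeddings
print(f'{prefix}_lm_head{lut_lmhead}')                         # llama_lm_head_lut6
print(f'{prefix}_FFN_PF{lut_ffn}_chunk_01of{num_chunks:02d}')  # llama_FFN_PF_lut6_chunk_01of02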
@@ -386,102 +474,99 @@ def make_causal_mask(length, start):
     mask[:, :, col_indices <= (row_indices + start)] = 0
     return mask

+def run_prefill(embed_model, ffn_models, input_ids, current_pos, context_length, batch_size, state, causal_mask):
     """Run prefill on the input sequence."""
+    #print(f"[DEBUG] Running prefill from 0 to {current_pos}")

     # Process in batches
     batch_pos = 0
+    while batch_pos < current_pos:
+        batch_end = min(batch_pos + batch_size, current_pos)
         current_batch_size = batch_end - batch_pos

+        #print(f"[DEBUG] Prefill batch {batch_pos}-{batch_end} (size={current_batch_size})")
+
         # Get current batch
         batch_input = input_ids[:, batch_pos:batch_end]

+        # Pad to full batch size
         batch_input = F.pad(
             batch_input,
             (0, batch_size - current_batch_size),
             value=0
         )

+        # Generate position IDs for this batch
+        position_ids = torch.arange(batch_pos, batch_pos + batch_size, dtype=torch.int32)
+
+        # Use the pre-initialized causal mask and extract the batch portion
+        batch_causal_mask = causal_mask[:, :, batch_pos:batch_pos + batch_size, :]

         # Run embeddings
         hidden_states = torch.from_numpy(
             embed_model.predict({'input_ids': batch_input.numpy()})['hidden_states']
         )

+        # Run through FFN chunks
         for ffn_model in ffn_models:
             if isinstance(ffn_model, dict):
                 inputs = {
+                    'hidden_states': hidden_states.numpy(),
+                    'position_ids': position_ids.numpy(),
+                    'causal_mask': batch_causal_mask.numpy(),
+                    'current_pos': np.array([batch_pos], dtype=np.int32)
                 }
                 output = ffn_model['prefill'].predict(inputs, state)
                 hidden_states = torch.from_numpy(output['output_hidden_states'])

         batch_pos = batch_end

+    return torch.tensor([current_pos], dtype=torch.int32)

+def generate_next_token(embed_model, ffn_models, lmhead_model, input_ids, pos, context_length, state, causal_mask, temperature=0.0):
     """Generate the next token."""
     # Get current token
+    current_token = input_ids[:, pos-1:pos]

     # Run embeddings
     hidden_states = torch.from_numpy(
         embed_model.predict({'input_ids': current_token.numpy()})['hidden_states']
+    )

     # Create masks
     update_mask = torch.zeros((1, 1, context_length, 1), dtype=torch.float16)
     update_mask[0, 0, pos-1, 0] = 1.0
+    position_ids = torch.tensor([pos-1], dtype=torch.int32)

+    # Use the pre-initialized causal mask and extract the single position portion
+    single_causal_mask = causal_mask[:, :, pos-1:pos, :]
+
+    # Run through FFN chunks
     for ffn_model in ffn_models:
         if isinstance(ffn_model, dict):
             inputs = {
                 'hidden_states': hidden_states.numpy(),
                 'update_mask': update_mask.numpy(),
                 'position_ids': position_ids.numpy(),
+                'causal_mask': single_causal_mask.numpy(),
                 'current_pos': position_ids.numpy()
             }
             output = ffn_model['infer'].predict(inputs, state)
             hidden_states = torch.from_numpy(output['output_hidden_states'])

+    # Run LM head and get next token
     lm_output = lmhead_model.predict({'hidden_states': hidden_states.numpy()})

     if 'logits1' in lm_output:
         logits_parts = []
         for i in range(1, 9):
             key = f'logits{i}'
             if key in lm_output:
                 logits_parts.append(torch.from_numpy(lm_output[key]))
+        logits = torch.cat(logits_parts, dim=-1)
     else:
         logits = torch.from_numpy(lm_output['output_logits'])

     if temperature > 0:
         logits = logits / temperature
         probs = F.softmax(logits[0, -1, :], dim=-1)
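run_prefill() now walks the prompt in fixed-size batches and, instead of building a mask per call, slices batch-sized row blocks out of the causal_mask it receives. Tracing the loop with assumed sizes (a 150-token prompt, batch_size 64, context length 512; the zero mask is only a stand-in for the real one):

import torch

context_length, batch_size, current_pos = 512, 64, 150
causal_mask = torch.zeros((1, 1, context_length, context_length), dtype=torch.float16)  # stand-in

batch_pos = 0
while batch_pos < current_pos:
    batch_end = min(batch_pos + batch_size, current_pos)
    current_batch_size = batch_end - batch_pos
    position_ids = torch.arange(batch_pos, batch_pos + batch_size, dtype=torch.int32)
    batch_causal_mask = causal_mask[:, :, batch_pos:batch_pos + batch_size, :]
    print(f"batch {batch_pos}-{batch_end}: pad {batch_size - current_batch_size} tokens, "
          f"mask rows {batch_pos}..{batch_pos + batch_size - 1}, shape {tuple(batch_causal_mask.shape)}")
    batch_pos = batch_end
# batch 0-64: pad 0 tokens, mask rows 0..63, shape (1, 1, 64, 512)
# batch 64-128: pad 0 tokens, mask rows 64..127, shape (1, 1, 64, 512)
# batch 128-150: pad 42 tokens, mask rows 128..191, shape (1, 1, 64, 512)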
@@ -503,36 +588,93 @@ def create_unified_state(ffn_models, context_length):
     print("\nCreated unified transformer state")
     return state

+def initialize_causal_mask(context_length):
+    """Initialize causal mask for transformer attention."""
+    causal_mask = make_causal_mask(context_length, 0)
+    causal_mask = torch.tensor(causal_mask, dtype=torch.float16)
+    print(f"\nInitialized causal mask for context length {context_length}")
+    return causal_mask
+
+def get_user_input():
+    """Get input from user, handling special key combinations."""
+    global THINKING_MODE
+    try:
+        import termios
+        import tty
+        import sys
+
+        def _getch():
+            fd = sys.stdin.fileno()
+            old_settings = termios.tcgetattr(fd)
+            try:
+                tty.setraw(sys.stdin.fileno())
+                ch = sys.stdin.read(1)
+            finally:
+                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
+            return ch
+
+        buffer = []
+        while True:
+            char = _getch()
+
+            # Debug: print the character code
+            print(f"\nKey pressed: {repr(char)} (hex: {hex(ord(char))})")
+
+            # Check for Enter key
+            if char == '\r' or char == '\n':
+                print()  # Move to next line
+                input_text = ''.join(buffer)
+                # Check if the command is /t
+                if input_text == '/t':
+                    THINKING_MODE = not THINKING_MODE
+                    print(f"Thinking mode {'ON' if THINKING_MODE else 'OFF'}")
+                    buffer = []  # Clear buffer
+                    print(f"\n{LIGHT_GREEN}You{' (thinking)' if THINKING_MODE else ''}:{RESET_COLOR}", end=' ', flush=True)
+                    continue
+                return input_text
+
+            # Handle backspace
+            if char == '\x7f':  # backspace
+                if buffer:
+                    buffer.pop()
+                    sys.stdout.write('\b \b')  # Erase character
+                    sys.stdout.flush()
+                continue
+
+            # Handle Ctrl-C
+            if char == '\x03':  # Ctrl-C
+                print("^C")
+                raise KeyboardInterrupt
+
+            # Print character and add to buffer
+            sys.stdout.write(char)
+            sys.stdout.flush()
+            buffer.append(char)
+
+    except ImportError:
+        # Fallback for systems without termios
+        return input("> ")
+
+def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state, causal_mask, auto_prompt=None, warmup=False):
     """Interactive chat loop."""
+    global THINKING_MODE
     context_length = metadata.get('context_length')
     batch_size = metadata.get('batch_size', 64)

     if not warmup:
         print(f"\nUsing context length: {context_length}")
         print("\nStarting chat session. Press Ctrl+D to exit.")
+        print("Type your message and press Enter to chat. Use /t to toggle thinking mode.")
+        print(f"Thinking mode is {'ON' if THINKING_MODE else 'OFF'}")

+    # Keep track of conversation history
     conversation = []

     try:
         while True:
             try:
                 if not warmup:
+                    print(f"\n{LIGHT_GREEN}You{' (thinking)' if THINKING_MODE else ''}:{RESET_COLOR}", end=' ', flush=True)
                 if auto_prompt is not None:
                     user_input = auto_prompt
                     if not warmup:
@@ -543,41 +685,69 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
                 if not warmup:
                     print("\nExiting chat...")
                 break
+
                 if not user_input:
                     continue
+
+                # Handle /t command
+                if user_input == "/t":
+                    THINKING_MODE = not THINKING_MODE
+                    print(f"Thinking mode {'ON' if THINKING_MODE else 'OFF'}")
+                    continue

+                # Add user message to conversation
+                conversation.append({"role": "user", "content": user_input})
+
+                # Format using chat template with full history
+                if THINKING_MODE:
+                    # Add thinking prompt to system message
+                    conversation_with_thinking = [{"role": "system", "content": THINKING_PROMPT}] + conversation
+                    base_input_ids = tokenizer.apply_chat_template(
+                        conversation_with_thinking,
                         return_tensors="pt",
                         add_generation_prompt=True
                     ).to(torch.int32)
                 else:
+                    base_input_ids = tokenizer.apply_chat_template(
+                        conversation,
                         return_tensors="pt",
+                        add_generation_prompt=True
+                    ).to(torch.int32)
+
+                # Check if we need to trim history
+                while base_input_ids.size(1) > context_length - 100:  # Leave room for response
+                    # Remove oldest message pair (user + assistant)
+                    if len(conversation) > 2:
+                        conversation = conversation[2:]  # Remove oldest pair
+                        base_input_ids = tokenizer.apply_chat_template(
+                            conversation,
+                            return_tensors="pt",
+                            add_generation_prompt=True
+                        ).to(torch.int32)
+                    else:
+                        # If only current message remains and still too long, truncate
+                        base_input_ids = base_input_ids[:, -context_length//2:]
+                        break
+
+                context_pos = base_input_ids.size(1)

+                # Pad sequence to context_size
+                input_ids = F.pad(
+                    base_input_ids,
+                    (0, context_length - context_pos),
+                    value=0
+                )

                 if not warmup:
                     print(f"\n{LIGHT_BLUE}Assistant:{RESET_COLOR}", end=' ', flush=True)

+                # Initialize token printer and collect response
                 token_printer = TokenPrinter(tokenizer)
+                response_tokens = []
+                generation_start_time = time.time()

                 try:
+                    # Run prefill on entire context
                     current_pos = run_prefill(
                         embed_model,
                         ffn_models,
@@ -585,21 +755,53 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
                         context_pos,
                         context_length,
                         batch_size,
+                        state,
+                        causal_mask
                     )
+                    #print(f"\n[DEBUG] After initial prefill - current_pos: {current_pos}")

+                    # Generation loop
                     pos = context_pos
+                    tokens_generated = 0
+                    inference_start = time.time()  # Start inference timing

+                    while True:
+                        # Check if we need to shift window
+                        if pos >= context_length - 2:
+                            # Calculate shift to maintain full batches
+                            batch_size = metadata.get('batch_size', 64)
+                            # Calculate max batches that fit in context
+                            max_batches = context_length // batch_size
+                            desired_batches = max(1, max_batches - 2)  # Leave room for new tokens
+                            new_size = min(desired_batches * batch_size, context_length - batch_size)
+
+                            # Create shifted input_ids
+                            tmp = torch.zeros((1, context_length), dtype=torch.int32)
+                            tmp[:,0:new_size] = input_ids[:,pos-new_size:pos]
+                            input_ids = tmp
+
+                            # Reset state and run prefill
+                            # keep the same state
+                            #state = create_unified_state(ffn_models, context_length)
+                            current_pos = run_prefill(
+                                embed_model,
+                                ffn_models,
+                                input_ids,
+                                new_size,  # Prefill the entire shifted content
+                                context_length,
+                                batch_size,
+                                state,
+                                causal_mask
+                            )
+
+                            # Start generating from the next position
+                            pos = new_size  # Don't back up, continue from where we left off
+
+                            #print(f"\n[DEBUG] After shift - next token will be at pos {pos}")
+                            #print(f"[DEBUG] Context before next token: {tokenizer.decode(input_ids[0, pos-40:pos])}")
+
+                            window_shifted = True
+
                         # Generate next token
                         next_token = generate_next_token(
                             embed_model,
@@ -608,146 +810,58 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
                             input_ids,
                             pos,
                             context_length,
+                            state,
+                            causal_mask
                         )

+                        # Add token
+                        input_ids[0, pos] = next_token
                         if not warmup:
                             token_printer.add_token(next_token)
                             token_printer.drain_buffer()
+                        response_tokens.append(next_token)

                         pos += 1
                         tokens_generated += 1

+                        # In warmup mode, limit tokens
                         if warmup and tokens_generated >= WARMUP_TOKEN_LIMIT:
                             break
+
                         if next_token == tokenizer.eos_token_id:
                             break

+                    inference_time = time.time() - inference_start  # Calculate inference time
+
+                    # Add assistant response to conversation
+                    response_text = token_printer.stop()
+                    conversation.append({"role": "assistant", "content": response_text})

+                    # Print stats only if not in warmup
                     if not warmup:
+                        total_time = time.time() - generation_start_time
+                        prefill_time = total_time - inference_time
+                        inference_tokens_per_sec = len(response_tokens) / inference_time if inference_time > 0 else 0
+                        prefill_ms = prefill_time * 1000
+                        prefill_tokens_per_sec = context_pos / prefill_time if prefill_time > 0 else 0
+                        print(f"{DARK_BLUE}{inference_tokens_per_sec:.1f} t/s, "
+                              f"TTFT: {prefill_ms:.1f}ms ({prefill_tokens_per_sec:.1f} t/s), "
+                              f"{len(response_tokens)} tokens{RESET_COLOR}")

                     if auto_prompt is not None:
                         break

                 except KeyboardInterrupt:
+                    if not warmup:
+                        print("\nGeneration interrupted")
                     token_printer.stop()
                     continue

         except Exception as e:
+            if not warmup:
+                print(f"\nError in chat loop: {str(e)}")
+                import traceback
+                traceback.print_exc()

 def main():
     args = parse_args()
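The window-shift branch a couple of hunks above (entered once pos reaches context_length - 2) keeps whole batches worth of the most recent tokens and re-prefills them against the same shared state and mask. With sizes that match the defaults mentioned in the diff (context_length 512, batch_size 64), the arithmetic works out as follows; the concrete numbers are an assumed illustration:

context_length, batch_size = 512, 64   # assumed values
pos = context_length - 2               # 510: the point at which the shift triggers

max_batches = context_length // batch_size                # 8
desired_batches = max(1, max_batches - 2)                 # 6, leaves room for new tokens
new_size = min(desired_batches * batch_size,              # 384
               context_length - batch_size)               # capped at 448
print(max_batches, desired_batches, new_size)             # 8 6 384

# The last new_size tokens (positions pos-384 .. pos-1) are copied to the front of a
# zeroed (1, context_length) buffer, run_prefill() is called on that prefix, and
# generation continues from pos = new_size.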
@@ -800,6 +914,9 @@ def main():
     # Create unified state once
     state = create_unified_state(ffn_models, metadata['context_length'])

+    # Initialize causal mask once
+    causal_mask = initialize_causal_mask(metadata['context_length'])
+
     # Warmup runs to prevent Python GIL issues with CoreML !
     if not args.nw:
         for i in range(2):
|
| 926 |
lmhead_model=lmhead_model,
|
| 927 |
tokenizer=tokenizer,
|
| 928 |
metadata=metadata,
|
| 929 |
+
state=state, # Pass the state
|
| 930 |
+
causal_mask=causal_mask, # Pass the causal mask
|
| 931 |
warmup=True,
|
| 932 |
auto_prompt="who are you?"
|
| 933 |
)
|
|
|
|
@@ -821,7 +939,8 @@
         lmhead_model=lmhead_model,
         tokenizer=tokenizer,
         metadata=metadata,
+        state=state,  # Pass the state
+        causal_mask=causal_mask,  # Pass the causal mask
         warmup=False,
         auto_prompt=args.prompt
     )