Change to local inference
Changed files:
- activations/candidate_vectors.pt +3 -0
- activations/deepseek-1.5b-candidate_vectors.pt +3 -0
- activations/deepseek-1.5b-offsets.pt +3 -0
- activations/offsets.pt +3 -0
- app.py +52 -143
- model.py +110 -0
- requirements.txt +8 -0
- scheduler.py +1 -1
- schemas.py +4 -11
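
In short: the Space previously proxied requests to a dedicated inference endpoint over aiohttp; this commit deletes that client (retry loop, health check, stop-button plumbing) from app.py and runs generation locally on ZeroGPU through a new model.py, which loads the model with nnsight, applies the steering intervention in-process, and streams tokens back to Gradio.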
activations/candidate_vectors.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed63186d01ddaf6df8835818144185b5fb05d1c9a4683fce9517a921472353b3
+size 804046
activations/deepseek-1.5b-candidate_vectors.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a4f3701085a9090e78fc402aaaef5adbf23f0b49c932f82eb4fc107d191aac0
+size 345294
activations/deepseek-1.5b-offsets.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d84cb880bee5feb83248b476d8d0f3f87dca74bc8ae53807f3ab2a9bdb959920
+size 345244
activations/offsets.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6cb10bd9014f9cd2470d37f56f491abd5f72bd162543a7569f40cd385f127c3
+size 803996
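
All four activation files above are tracked with Git LFS, so the diff records only each three-line pointer file (spec version, SHA-256 object id, byte size) rather than the binary tensors themselves.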
app.py
CHANGED
@@ -1,35 +1,25 @@
-import os
+import threading
 import logging
 from pathlib import Path
-import asyncio
-import aiohttp
+from typing import Dict
+import spaces
 import pandas as pd
 import gradio as gr
 from gradio_toggle import Toggle
+from transformers import TextIteratorStreamer
+from model import load_model
 from scheduler import load_scheduler
 from schemas import UserRequest, SteeringOutput, CONFIG
 
-
-MAX_RETRIES = 10
-MAX_RETRY_WAIT_TIME = 75
-MIN_RETRY_WAIT_TIME = 5
-ENDPOINT_ALIVE = False
-
-HF_TOKEN = os.getenv('HF_TOKEN')
-API_URL = "https://a6k5m81qw14hkvhz.us-east-1.aws.endpoints.huggingface.cloud"
-headers = {
-    "Accept" : "application/json",
-    "Authorization": f"Bearer {HF_TOKEN}",
-    "Content-Type": "application/json"
-}
-
 logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s:%(message)s')
 logger = logging.getLogger(__name__)
 
 model_name = "DeepSeek-R1-Distill-Qwen-7B"
 examples = pd.read_csv("assets/examples.csv")
+
 instances = {}
 scheduler = load_scheduler()
+model = load_model()
 
 
 HEAD = """
@@ -198,8 +188,6 @@ def initialize_instance(request: gr.Request):
 
 
 def cleanup_instance(request: gr.Request):
-    global ENDPOINT_ALIVE
-
     session_id = request.session_hash
 
     if session_id in instances:
@@ -209,51 +197,48 @@ def cleanup_instance(request: gr.Request):
 
         del instances[session_id]
 
-    if len(instances) == 0:
-        ENDPOINT_ALIVE = False
-
     logger.info("Number of connections: %d", len(instances))
 
 
-[… removed endpoint health-check helper; several lines lost in extraction …]
-        resp_text = await resp.text()
-        if resp.status == 200:
-            alive = True
-        else:
-            logger.error("API Error Code: %d, Message: %s", resp.status, resp_text)
-
-        await session.close()
-        return alive
-
-[… removed endpoint retry loop; several lines lost in extraction …]
-            gr.Info("Inference endpoint is ready")
-            yield "Ready"
-            break
-
-        gr.Warning("Initializing inference endpoint\n(This may take 2~3 minutes)", duration=sleep_time)
-        await asyncio.sleep(sleep_time)
-        sleep_time = max(sleep_time * 0.8, MIN_RETRY_WAIT_TIME)
-
-    yield "Server Error"
+@spaces.GPU(duration=90)
+def generate(prompt: str, steering: bool, coeff: float, generation_config: Dict[str, float], layer: int, k: float):
+    formatted_prompt = model.apply_chat_template(prompt)
+    inputs = model.tokenize(formatted_prompt)
+
+    streamer = TextIteratorStreamer(model.tokenizer, timeout=10, skip_prompt=True, skip_special_tokens=True)
+
+    if steering:
+        thread = threading.Thread(
+            target=model.steer_generation,
+            args=(inputs, streamer, k, layer, coeff, generation_config)
+        )
+    else:
+        thread = threading.Thread(
+            target=model.run_generation,
+            args=(inputs, streamer, generation_config)
+        )
+
+    thread.start()
+
+    generated_text = "<think>"
+    for new_text in streamer:
+        generated_text += new_text
+        yield generated_text
+
+
+def generate_output(
+    session_id: str, prompt: str, steering: bool, coeff: float,
+    max_new_tokens: int, top_p: float, temperature: float, layer: int, vec_scaling: float
+):
+    req = UserRequest(
+        session_id=session_id, prompt=prompt, steering=steering, coeff=coeff,
+        max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature, vec_scale=vec_scaling, layer=layer
+    )
+
+    logger.info("User request: %s", req)
+    instances[session_id].append(req)
+
+    yield from generate(prompt, steering, coeff, req.generation_config(), layer, req.k)
 
 
 async def post_process(session_id, output):
@@ -266,62 +251,11 @@ async def post_process(session_id, output):
             answer = None
         else:
             answer = p[-1]
-    else:
-        answer = None
-        reasoning = output
-
-    steering_output = SteeringOutput(**req.model_dump(), reasoning=reasoning, answer=answer)
-    instances[session_id].append(steering_output)
-
-
-class Generator:
-    def __init__(self):
-        self.stop_events = {}
-
-    async def stop(self, session_id):
-        self.stop_events[session_id] = True
-        logger.info("Stopping generation")
-
-    async def generate(
-        self, session_id: str, prompt: str, steering: bool, coeff: float,
-        max_new_tokens: int, top_p: float, temperature: float, layer: int, vec_scaling: float
-    ):
-        req = UserRequest(
-            session_id=session_id, prompt=prompt, steering=steering, coeff=coeff,
-            max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature, vec_scale=vec_scaling, layer=layer
-        )
-
-        instances[session_id].append(req)
-
-        data = req.get_api_format()
-        logger.info("User Request: %s", data)
-
-        generated_text = ""
-        self.stop_events[session_id] = False
-
-        try:
-            async with aiohttp.ClientSession() as session:
-                async with session.post(f"{API_URL}/generate", headers=headers, json=data) as resp:
-                    if resp.status == 200:
-                        generated_text += "<think>"
-
-                        async for chunk, _ in resp.content.iter_chunks():
-                            if self.stop_events[session_id]:
-                                break
-[… chunk-handling lines lost in extraction …]
-                    else:
-                        logger.error("API Error Ccode: %d, Error Message: %s", resp.status, resp.text())
-                        raise gr.Error("API Server Error")
-
-        except:
-            logger.info("Client session error")
-
-        await post_process(session_id, generated_text)
-
-        del self.stop_events[session_id]
+
+    steering_output = SteeringOutput(**req.model_dump(), reasoning=reasoning, answer=answer)
+    instances[session_id].append(steering_output)
+
+    return gr.update(interactive=True), gr.update(interactive=True)
 
 
 async def output_feedback(session_id, feedback):
@@ -339,31 +273,13 @@ async def output_feedback(session_id, feedback):
         logger.debug("Feedback submission error")
 
 
-async def show_feedback_buttons(upvote_btn, downvote_btn):
-    return gr.update(interactive=True), gr.update(interactive=True)
-
-
 gr.set_static_paths(paths=[Path.cwd().absolute() / "assets"])
 theme = gr.themes.Base(primary_hue="emerald", text_size=gr.themes.sizes.text_lg).set()
-generator = Generator()
 
 with gr.Blocks(title="LLM Censorship Steering", theme=theme, head=HEAD, css=CSS, js=JS) as demo:
     session_id = gr.State()
-    endpoint_state = gr.State(get_endpoint_state)
-
     gr.HTML(HTML)
-
-    @gr.render(inputs=endpoint_state, triggers=[endpoint_state.change])
-    def render_state(endpoint_state):
-        if endpoint_state == "Ready":
-            color = "green"
-        elif endpoint_state == "Server Error":
-            color = "red"
-        else:
-            color = "orange"
-
-        if endpoint_state != None:
-            gr.Markdown(f'🤖 {model_name} | Inference Endpoint State: <span style="color:{color}; font-weight: bold;">{endpoint_state}</span>', elem_id="model-state")
+    gr.Markdown(f'🤖 {model_name}')
 
     with gr.Row(elem_id="main-components"):
        with gr.Column(scale=1):
@@ -382,7 +298,6 @@ with gr.Blocks(title="LLM Censorship Steering", theme=theme, head=HEAD, css=CSS,
 
        with gr.Row():
            clear_btn = gr.ClearButton()
-           stop_btn = gr.Button("Stop")
            generate_btn = gr.Button("Generate", variant="primary")
 
        with gr.Accordion("⚙️ Advanced Settings", open=False):
@@ -408,25 +323,19 @@ with gr.Blocks(title="LLM Censorship Steering", theme=theme, head=HEAD, css=CSS,
    gr.Examples(examples=examples[examples["type"] == "harmful"].prompt.tolist(), inputs=input_text, label="Harmful")
 
 
-    @gr.on(triggers=[clear_btn.click… [rest of line lost in extraction]
-    def … [name lost in extraction]
+    @gr.on(triggers=[clear_btn.click], outputs=[upvote_btn, downvote_btn])
+    def clear():
        return gr.update(interactive=False), gr.update(interactive=False)
-
-    @gr.on(triggers=[generate_btn.click], outputs=[upvote_btn, downvote_btn])
-    def show_feedback_buttons():
-        return gr.update(interactive=True), gr.update(interactive=True)
-
-
-    submission = generate_btn.click(
-        generator.generate, inputs=[session_id, input_text, steer_toggle, coeff, max_new_tokens, top_p, temperature, layer, vec_scaling], outputs=output
-    )
 
    clear_btn.add([input_text, output])
-
+    generate_btn.click(
+        generate_output, inputs=[session_id, input_text, steer_toggle, coeff, max_new_tokens, top_p, temperature, layer, vec_scaling], outputs=output
+    ).success(
+        post_process, inputs=[session_id, output], outputs=[upvote_btn, downvote_btn]
+    )
 
    upvote_btn.click(output_feedback, inputs=[session_id, upvote_btn])
    downvote_btn.click(output_feedback, inputs=[session_id, downvote_btn])
-
    layer.change(fn=lambda x: 1, inputs=vec_scaling, outputs=vec_scaling)
 
    demo.load(initialize_instance, outputs=session_id)
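
The new generate() uses the standard transformers streaming pattern: model.generate() blocks until completion, so it runs on a worker thread while the caller drains a TextIteratorStreamer and yields partial text to Gradio. A minimal, self-contained sketch of that pattern (gpt2 here is a stand-in checkpoint, not the Space's model):

import threading
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Steering vectors are", return_tensors="pt")
# skip_prompt=True mirrors app.py: only newly generated tokens are streamed
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs on a background thread
thread = threading.Thread(
    target=model.generate,
    kwargs={**inputs, "streamer": streamer, "max_new_tokens": 30},
)
thread.start()

# the consumer loop accumulates and re-yields partial text, as generate() does
text = ""
for chunk in streamer:
    text += chunk
    print(text)
thread.join()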
model.py
ADDED
@@ -0,0 +1,110 @@
+import os, warnings
+from operator import attrgetter
+from typing import List, Dict
+
+import torch
+import torch.nn.functional as F
+from torchtyping import TensorType
+from transformers import TextIteratorStreamer
+from transformers import AutoTokenizer, BatchEncoding
+import nnsight
+from nnsight import LanguageModel
+from nnsight.intervention import Envoy
+
+warnings.filterwarnings("ignore")
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+# nnsight with multi-threading: https://github.com/ndif-team/nnsight/issues/280
+nnsight.CONFIG.APP.GLOBAL_TRACING = False
+
+config = {
+    "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+    "steering_vec": "activations/candidate_vectors.pt",
+    "offset": "activations/offsets.pt",
+}
+
+def detect_module_attrs(model: LanguageModel) -> str:
+    if "model" in model._modules and "layers" in model.model._modules:
+        return "model.layers"
+    elif "transformers" in model._modules and "h" in model.transformers._modules:
+        return "transformers.h"
+    else:
+        raise Exception("Failed to detect module attributes.")
+
+
+class ModelBase:
+    def __init__(
+        self, model_name: str,
+        steering_vecs: TensorType, offsets: TensorType,
+        tokenizer: AutoTokenizer = None, block_module_attr=None
+    ):
+        if tokenizer is None:
+            self.tokenizer = self._load_tokenizer(model_name)
+        else:
+            self.tokenizer = tokenizer
+        self.model = self._load_model(model_name, self.tokenizer)
+
+        self.device = self.model.device
+        self.hidden_size = self.model.config.hidden_size
+        if block_module_attr is None:
+            self.block_modules = self.get_module(detect_module_attrs(self.model))
+        else:
+            self.block_modules = self.get_module(block_module_attr)
+
+        self.steering_vecs = F.normalize(steering_vecs, dim=-1)
+        self.steering_vecs, self.offsets = self.set_dtype(self.steering_vecs, offsets)
+
+    def _load_model(self, model_name: str, tokenizer: AutoTokenizer) -> LanguageModel:
+        return LanguageModel(model_name, tokenizer=tokenizer, dispatch=True, trust_remote_code=True, device_map="auto", torch_dtype=torch.bfloat16)
+
+    def _load_tokenizer(self, model_name) -> AutoTokenizer:
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        tokenizer.padding_side = "left"
+        if not tokenizer.pad_token:
+            tokenizer.pad_token_id = tokenizer.eos_token_id
+            tokenizer.pad_token = tokenizer.eos_token
+
+        tokenizer.chat_template = tokenizer.chat_template.replace("<|Assistant|><think>\\n", "<|Assistant|><think>")
+        return tokenizer
+
+    def tokenize(self, prompt: str) -> BatchEncoding:
+        return self.tokenizer(prompt, padding=True, truncation=False, return_tensors="pt")
+
+    def get_module(self, attr: str) -> Envoy:
+        return attrgetter(attr)(self.model)
+
+    def set_dtype(self, *vars):
+        if len(vars) == 1:
+            return vars[0].to(self.model.dtype)
+        else:
+            return (var.to(self.model.dtype) for var in vars)
+
+    def apply_chat_template(self, instruction: str) -> List[str]:
+        messages = [{"role": "user", "content": instruction}]
+        return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    def run_generation(self, inputs, streamer: TextIteratorStreamer, generation_config: Dict):
+        inputs = inputs.to(self.device)
+        _ = self.model._model.generate(**inputs, do_sample=True, streamer=streamer, **generation_config)
+
+    def steer_generation(
+        self, inputs, streamer: TextIteratorStreamer, k: float,
+        layer: int, coeff: float, generation_config: Dict
+    ):
+        layer_block = self.block_modules[layer]
+        unit_vec = self.steering_vecs[layer]
+        offset = self.offsets[layer]
+
+        with self.model.generate(inputs, do_sample=True, streamer=streamer, **generation_config):
+            with self.block_modules.all():
+                acts = layer_block.output[0].clone()
+                proj = (acts - offset) @ unit_vec.unsqueeze(-1) * unit_vec
+                layer_block.output[0][:] = acts - proj + coeff * k * unit_vec
+
+
+def load_model() -> ModelBase:
+    steering_vecs = torch.load(config['steering_vec'], weights_only=True)
+    offsets = torch.load(config['offset'], weights_only=True)
+    model = ModelBase(config['model_name'], steering_vecs=steering_vecs, offsets=offsets)
+    return model
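
steer_generation edits the residual stream at one layer: it removes each hidden state's component along the steering direction (measured relative to a per-layer offset) and re-injects that direction at strength coeff * k. A standalone sketch of the same arithmetic on dummy tensors, with illustrative shapes and no nnsight dependency:

import torch
import torch.nn.functional as F

batch, seq, hidden = 1, 4, 8                         # illustrative shapes
acts = torch.randn(batch, seq, hidden)               # residual-stream activations
unit_vec = F.normalize(torch.randn(hidden), dim=-1)  # unit-norm steering vector
offset = torch.randn(hidden)                         # per-layer offset
coeff, k = 1.0, 2.0                                  # UI coefficient and vector scale

# scalar projection of the offset-centred activations onto the direction,
# broadcast back to a vector: (batch, seq, 1) * (hidden,) -> (batch, seq, hidden)
proj = (acts - offset) @ unit_vec.unsqueeze(-1) * unit_vec

# erase the existing component along the direction, then add it back at the
# requested strength, as layer_block.output[0][:] = ... does in model.py
steered = acts - proj + coeff * k * unit_vec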
requirements.txt
CHANGED
@@ -2,3 +2,11 @@ aiohttp==3.11.16
 pandas==2.2.2
 pyarrow==19.0.1
 gradio_toggle==2.0.2
+transformers==4.50.0
+accelerate==1.6.0
+nnsight==0.4.3
+triton==3.1.0
+torchtyping==0.1.5
+tiktoken==0.8.0
+transformers_stream_generator==0.0.5
+zstandard==0.23.0
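
The added pins support the local-inference path: transformers, accelerate, and nnsight drive model.py, and torchtyping supplies its tensor annotations; triton, tiktoken, transformers_stream_generator, and zstandard are, presumably, transitive requirements of the checkpoint's tokenizer and remote code.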
scheduler.py
CHANGED
@@ -14,7 +14,7 @@ logger = logging.getLogger(__name__)
 
 def load_scheduler():
     return ParquetScheduler(
-        repo_id="hannahcyberey/Censorship-Steering-Logs", every=… [old value lost in extraction],
+        repo_id="hannahcyberey/Censorship-Steering-Logs", every=60,
         private=True,
         squash_history=False,
         schema={
schemas.py
CHANGED
@@ -32,18 +32,11 @@ class UserRequest(BaseModel):
        else:
            self.k = self.vec_scale * vector_scaling[self.layer]["k_neg"]
 
-    def get_api_format(self):
+    def generation_config(self):
        return {
-            [… three dict entries lost in extraction …]
-            "k": self.k,
-            "layer": self.layer,
-            "generation_config": {
-                "max_new_tokens": self.max_new_tokens,
-                "top_p": self.top_p,
-                "temperature": self.temperature
-            }
+            "max_new_tokens": self.max_new_tokens,
+            "top_p": self.top_p,
+            "temperature": self.temperature
        }
 
 
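
The net effect is that UserRequest no longer serializes an API payload; it only exposes the sampling settings. A minimal sketch of the resulting shape, assuming pydantic (which the hunk header's BaseModel suggests) and with illustrative defaults not taken from the Space:

from pydantic import BaseModel

class Req(BaseModel):
    max_new_tokens: int = 512
    top_p: float = 0.9
    temperature: float = 0.7

    def generation_config(self):
        # flat kwargs, ready to splat into model.generate(**inputs, **cfg)
        return {
            "max_new_tokens": self.max_new_tokens,
            "top_p": self.top_p,
            "temperature": self.temperature,
        }

print(Req().generation_config())
# {'max_new_tokens': 512, 'top_p': 0.9, 'temperature': 0.7}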