---
license: mit
language:
- en
tags:
- llm
- safety
- jailbreak
- knowledge
---
# Introduction

This model generates jailbreak prompts from knowledge-point texts. It is built on the Llama-2-7b base model and fine-tuned on the Knowledge-to-Jailbreak dataset. The model is intended to bridge the gap between theoretical vulnerabilities and real-world application scenarios by simulating sophisticated adversarial attacks that incorporate specialized knowledge.

Our proposed method and dataset serve as a critical starting point for both offensive and defensive research, enabling the development of new techniques to enhance the security and robustness of language models in practical settings.

# How to load the model and tokenizer

We provide two helper functions for loading the model and tokenizer.

```python

import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification

import os

import json

from peft import PeftModel

# from trl import AutoModelForCausalLMWithValueHead

from transformers import AutoModelForCausalLM as AutoGPTQForCausalLM

def load_tokenizer(dir_or_model):
    """
    Load the tokenizer that belongs to a model directory or model identifier.

    The name passed to ``AutoTokenizer.from_pretrained`` is resolved as follows:

    1. Start from ``dir_or_model`` itself.
    2. If ``dir_or_model`` is a local directory containing ``adapter_config.json``
       (a LoRA/PEFT adapter), use the adapter's ``base_model_name_or_path``.
    3. If ``dir_or_model`` is a local directory containing ``config.json`` with a
       non-empty ``_name_or_path`` entry, that entry takes precedence.

    Args:
        dir_or_model: Either a local directory with model/adapter files or a
            model identifier understood by ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer. If it has no pad token, the EOS token is reused
        as the pad token so that batched (padded) encoding works.
    """
    def _read_json(path):
        # Context manager so the file handle is always closed.
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)

    model_name = dir_or_model

    adapter_config_path = os.path.join(dir_or_model, "adapter_config.json")
    if os.path.isfile(adapter_config_path):
        model_name = _read_json(adapter_config_path)["base_model_name_or_path"]

    config_path = os.path.join(dir_or_model, "config.json")
    if os.path.isfile(config_path):
        # An empty/missing "_name_or_path" keeps the name resolved above.
        model_name = _read_json(config_path).get("_name_or_path") or model_name

    # Load from the resolved name instead of a machine-specific absolute path,
    # so the helper works outside the original training environment.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    return tokenizer

def load_model(dir_or_model, classification=False, token_classification=False, return_tokenizer=False,
               dtype=torch.bfloat16, load_dtype=True,
               rl=False, peft_config=None, device_map="auto", revision='main'):
    """
    Load a model (optionally with a LoRA/PEFT adapter) for the requested task.

    If ``dir_or_model`` is a local directory containing ``adapter_config.json``,
    the base model named in that file is loaded first and the adapter in
    ``dir_or_model`` is then applied on top of it with ``PeftModel``.

    Args:
        dir_or_model: A local directory with model/adapter files or a model
            identifier understood by ``from_pretrained``.
        classification (bool): If True, load a sequence-classification model.
        token_classification (bool): If True (and ``classification`` is False),
            load a token-classification model.
        return_tokenizer (bool): If True, return ``(model, tokenizer)``.
        dtype: Torch dtype used to load causal language models. The
            classification heads are always loaded in ``torch.float32``
            (NOTE(review): the original author flagged a dtype-related failure
            on that path, so it is left untouched).
        load_dtype (bool): If False, ``dtype`` is overridden with
            ``torch.float32``.
        rl (bool): Accepted for interface compatibility; not used here.
        peft_config: Accepted for interface compatibility; not used here.
        device_map: Forwarded to the underlying loader.
        revision: Model revision forwarded to ``from_pretrained``.

    Returns:
        The loaded model, or ``(model, tokenizer)`` when ``return_tokenizer``
        is True.
    """
    import warnings

    if not load_dtype:
        dtype = torch.float32

    adapter_config_path = os.path.join(dir_or_model, "adapter_config.json")
    is_lora_dir = os.path.isfile(adapter_config_path)
    if is_lora_dir:
        with open(adapter_config_path, "r", encoding="utf-8") as handle:
            model_name = json.load(handle)["base_model_name_or_path"]
    else:
        model_name = dir_or_model
    original_model_name = model_name

    # No explicit `use_auth_token=True`: that argument is deprecated and makes
    # loading fail for users without a stored token. A locally saved Hub token
    # is still picked up by default when one exists.
    if classification:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name, trust_remote_code=True, torch_dtype=torch.float32,
            device_map=device_map, revision=revision)
    elif token_classification:
        model = AutoModelForTokenClassification.from_pretrained(
            model_name, trust_remote_code=True, torch_dtype=torch.float32,
            device_map=device_map, revision=revision)
    elif model_name.endswith(("GPTQ", "GGML")):
        # `AutoGPTQForCausalLM` may be the real auto-gptq class or, as in this
        # file's imports, an alias of transformers' AutoModelForCausalLM, which
        # has no `from_quantized`. Use whichever loader is actually available.
        quantized_loader = getattr(AutoGPTQForCausalLM, "from_quantized", None)
        if quantized_loader is not None:
            # use_triton=True is intentionally not passed: it currently breaks,
            # at the cost of fairly slow generation with the GPTQ model.
            model = quantized_loader(model_name,
                                     use_safetensors=True,
                                     trust_remote_code=True,
                                     quantize_config=None,
                                     device_map=device_map)
        else:
            model = AutoGPTQForCausalLM.from_pretrained(
                model_name, trust_remote_code=True,
                device_map=device_map, revision=revision)
    else:
        # Honour the requested dtype (previously float32 was hard-coded here,
        # which silently ignored both `dtype` and `load_dtype`).
        model = AutoModelForCausalLM.from_pretrained(
            model_name, trust_remote_code=True, torch_dtype=dtype,
            device_map=device_map, revision=revision)

    if is_lora_dir:
        model = PeftModel.from_pretrained(model, dir_or_model)

    # Best effort: align the model's pad token with the tokenizer. A failure
    # here must not prevent returning the model, but it should be visible.
    tokenizer = None
    try:
        tokenizer = load_tokenizer(original_model_name)
        model.config.pad_token_id = tokenizer.pad_token_id
    except Exception as exc:
        warnings.warn(
            f"Could not set pad_token_id from the tokenizer of "
            f"{original_model_name!r}: {exc}")

    if return_tokenizer:
        if tokenizer is None:
            # The caller explicitly asked for a tokenizer, so let errors surface.
            tokenizer = load_tokenizer(original_model_name)
        return model, tokenizer
    return model

# Hub identifier of the released jailbreak-generator checkpoint.
model_name = 'tsq2000/Jailbreak-generator'

# Fetch the model and its tokenizer through a single helper call.
model, tokenizer = load_model(model_name, return_tokenizer=True)

```

# How to generate jailbreak prompts

Here is an example of how to generate jailbreak prompts based on knowledge point texts.

```python

# Load the generator and its tokenizer.
model_name = 'tsq2000/Jailbreak-generator'
model = load_model(model_name)
tokenizer = load_tokenizer(model_name)

# Batched generation with a decoder-only model needs left padding, so that
# every prompt ends exactly where the generated tokens begin.
tokenizer.padding_side = "left"

max_length = 2048  # total token budget: prompt tokens + generated tokens
max_tokens = 64    # number of new tokens generated per prompt

knowledge_points = ["Kettling Kettling (also known as containment or corralling) is a police tactic for controlling large crowds during demonstrations or protests. It involves the formation of large cordons of police officers who then move to contain a crowd within a limited area. Protesters are left only one choice of exit controlled by the police – or are completely prevented from leaving, with the effect of denying the protesters access to food, water and toilet facilities for a time period determined by the police forces. The tactic has proved controversial, in part because it has resulted in the detention of ordinary bystanders."]

# Wrap every knowledge point in the prompt template.
batch_texts = [f'### Input:\n{input_}\n\n### Response:\n' for input_ in knowledge_points]

# Truncate prompts so that prompt + generated tokens stay within max_length.
inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length - max_tokens).to(model.device)

# Greedy decoding; temperature/top_p are passed at neutral values and do not
# affect the result when do_sample=False.
outputs = model.generate(**inputs, max_new_tokens=max_tokens, num_return_sequences=1, do_sample=False, temperature=1, top_p=1, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id)

# Keep only the newly generated tokens. Slicing the decoded string by
# len(prompt) is unreliable: the decoded prompt can differ from the original
# text (e.g. after truncation), which would cut into or drop the generated part.
prompt_length = inputs["input_ids"].shape[1]
generated_texts = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

print(generated_texts)

```

# Citation

If you find this model useful, please cite the following paper:

```
@misc{tu2024knowledgetojailbreak,
      title={Knowledge-to-Jailbreak: One Knowledge Point Worth One Attack},
      author={Shangqing Tu and Zhuoran Pan and Wenxuan Wang and Zhexin Zhang and Yuliang Sun and Jifan Yu and Hongning Wang and Lei Hou and Juanzi Li},
      year={2024},
      eprint={2406.11682},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2406.11682},
}

```