tsq2000
/

Jailbreak-generator

@@ -1,277 +1,272 @@
-**# Introduction**
-This is a model for generating a jailbreak prompt based on knowledge point texts. The model is trained on the Llama-2-7b dataset and fine-tuned on the Knowledge-to-Jailbreak dataset. The model is intended to bridge the gap between theoretical vulnerabilities and real-world application scenarios, simulating sophisticated adversarial attacks that incorporate specialized knowledge.
-Our proposed method and dataset serve as a critical starting point for both offensive and defensive research, enabling the development of new techniques to enhance the security and robustness of language models in practical settings.
-**# How to load the model and tokenizer**
-We provide two helper functions for loading the model and tokenizer.
-\```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
-import os
-import json
-from peft import PeftModel
-\# from trl import AutoModelForCausalLMWithValueHead
-from transformers import AutoModelForCausalLM as AutoGPTQForCausalLM
-def load_tokenizer(dir_or_model):
-    """
-    This function is used to load the tokenizer for a specific pre-trained model.
-    Args:
-        dir_or_model: It can be either a directory containing the pre-training model configuration details or a pretrained model.
-    Returns:
-        It returns a tokenizer that can convert text to tokens for the specific model input.
-    """
-    is_lora_dir = os.path.isfile(os.path.join(dir_or_model, "adapter_config.json"))
-    if is_lora_dir:
-        loaded_json = json.load(open(os.path.join(dir_or_model, "adapter_config.json"), "r"))
-        model_name = loaded_json["base_model_name_or_path"]
-    else:
-        model_name = dir_or_model
-    if os.path.isfile(os.path.join(dir_or_model, "config.json")):
-        loaded_json = json.load(open(os.path.join(dir_or_model, "config.json"), "r"))
-        if "_name_or_path" in loaded_json:
-            model_name = loaded_json["_name_or_path"]
-    local_model_name = "/data3/MODELS/llama2-hf/llama-2-7b"#/data2/tsq/WaterBench/data/models/llama-2-7b-chat-hf
-    print(">>>>>>>>>>>>>>>>>>>>>>>>>>notice this<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
-    \#print(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(local_model_name)
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-        tokenizer.pad_token_id = tokenizer.eos_token_id
-    return tokenizer
-def load_model(dir_or_model, classification=False, token_classification=False, return_tokenizer=False, dtype=torch.bfloat16, load_dtype=True,
-                rl=False, peft_config=None, device_map="auto", revision='main'):
-    """
-    This function is used to load a model based on several parameters including the type of task it is targeted to perform.
-    Args:
-        dir_or_model: It can be either a directory containing the pre-training model configuration details or a pretrained model.
-        classification (bool): If True, loads the model for sequence classification.
-        token_classification (bool): If True, loads the model for token classification.
-        return_tokenizer (bool): If True, returns the tokenizer along with the model.
-        dtype: The data type that PyTorch should use internally to store the model’s parameters and do the computation.
-        load_dtype (bool): If False, sets dtype as torch.float32 regardless of the passed dtype value.
-        rl (bool): If True, loads model specifically designed to be used in reinforcement learning environment.
-        peft_config: Configuration details for Peft models.
-    Returns:
-        It returns a model for the required task along with its tokenizer, if specified.
-    """
-    is_lora_dir = os.path.isfile(os.path.join(dir_or_model, "adapter_config.json"))
-    if not load_dtype:
-        dtype = torch.float32
-    if is_lora_dir:
-        loaded_json = json.load(open(os.path.join(dir_or_model, "adapter_config.json"), "r"))
-        model_name = loaded_json["base_model_name_or_path"]
-    else:
-        model_name = dir_or_model
-    original_model_name = model_name
-    \#local_model_name = "/data1/tsq/zkj_use/MODELS/phi-2"
-    \#local_model_name = "/data2/tsq/WaterBench/data/models/llama-2-7b-chat-hf"
-    \#local_model_name = "/data3/MODELS/llama2-hf/llama-2-7b"
-    \#print(model_name)
-    if classification:
-        model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32, use_auth_token=True, device_map=device_map, revision=revision)  # to investigate: calling torch_dtype here fails.
-    elif token_classification:
-        model = AutoModelForTokenClassification.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32, use_auth_token=True, device_map=device_map, revision=revision)
-    \# elif rl:
-    \#     model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32, use_auth_token=True,
-    \#                                                               peft_config=peft_config, device_map=device_map, revision=revision)
-    else:
-        if model_name.endswith("GPTQ") or model_name.endswith("GGML"):
-            model = AutoGPTQForCausalLM.from_quantized(model_name,
-                                                        use_safetensors=True,
-                                                        trust_remote_code=True,
-                                                        \# use_triton=True, # breaks currently, unfortunately generation time of the GPTQ model is quite slow
-                                                        quantize_config=None, device_map=device_map)
-        else:
-            print('11111111111111111111111111111111111111')
-            model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32, use_auth_token=True, device_map=device_map, revision=revision)
-    if is_lora_dir:
-        model = PeftModel.from_pretrained(model, dir_or_model)
-    try:
-        tokenizer = load_tokenizer(original_model_name)
-        model.config.pad_token_id = tokenizer.pad_token_id
-    except Exception:
-        pass
-    if return_tokenizer:
-        return model, load_tokenizer(original_model_name)
-    return model
-model_name = 'tsq2000/Jailbreak-generator'
-model = load_model(model_name)
-tokenizer = load_tokenizer(model_name)
-\```
-**# How to generate jailbreak prompts**
-Here is an example of how to generate jailbreak prompts based on knowledge point texts.
-\```python
-model_name = 'tsq2000/Jailbreak-generator'
-model = load_model(model_name)
-tokenizer = load_tokenizer(model_name)
-max_length = 2048
-max_tokens = 64
-knowledge_points = ["Kettling Kettling (also known as containment or corralling) is a police tactic for controlling large crowds during demonstrations or protests. It involves the formation of large cordons of police officers who then move to contain a crowd within a limited area. Protesters are left only one choice of exit controlled by the police – or are completely prevented from leaving, with the effect of denying the protesters access to food, water and toilet facilities for a time period determined by the police forces. The tactic has proved controversial, in part because it has resulted in the detention of ordinary bystanders."]
-batch_texts = [f'### Input:\n{input_}\n\n### Response:\n' for input_ in knowledge_points]
-inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length - max_tokens).to(model.device)
-outputs = model.generate(**inputs, max_new_tokens=max_tokens,      num_return_sequences=1, do_sample=False, temperature=1, top_p=1, eos_token_id=tokenizer.eos_token_id)
-generated_texts = []
-for output, input_text in zip(outputs, batch_texts):
-    text = tokenizer.decode(output, skip_special_tokens=True)
-    generated_texts.append(text[len(input_text):])
-print(generated_texts)
-\```
-**# Citation**
-If you find this model useful, please cite the following paper:
-\```
-@misc{tu2024knowledgetojailbreak,
-      title={Knowledge-to-Jailbreak: One Knowledge Point Worth One Attack},
-      author={Shangqing Tu and Zhuoran Pan and Wenxuan Wang and Zhexin Zhang and Yuliang Sun and Jifan Yu and Hongning Wang and Lei Hou and Juanzi Li},
-      year={2024},
-      eprint={2406.11682},
-      archivePrefix={arXiv},
-      primaryClass={cs.CL},
-      url={https://arxiv.org/abs/2406.11682},
-}
-\```

+---
+license: mit
+language:
+- en
+tags:
+- llm
+- safety
+- jailbreak
+- knowledge
+---
+# Introduction
+This is a model for generating jailbreak prompts from knowledge point texts. The model is built on the Llama-2-7b base model and fine-tuned on the Knowledge-to-Jailbreak dataset. The model is intended to bridge the gap between theoretical vulnerabilities and real-world application scenarios, simulating sophisticated adversarial attacks that incorporate specialized knowledge.
+Our proposed method and dataset serve as a critical starting point for both offensive and defensive research, enabling the development of new techniques to enhance the security and robustness of language models in practical settings.
+# How to load the model and tokenizer
+We provide two helper functions for loading the model and tokenizer.
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
+import os
+import json
+from peft import PeftModel
+# from trl import AutoModelForCausalLMWithValueHead
+from transformers import AutoModelForCausalLM as AutoGPTQForCausalLM
+def load_tokenizer(dir_or_model):
+    """
+    This function is used to load the tokenizer for a specific pre-trained model.
+    Args:
+        dir_or_model: It can be either a directory containing the pre-training model configuration details or a pretrained model.
+    Returns:
+        It returns a tokenizer that can convert text to tokens for the specific model input.
+    """
+    is_lora_dir = os.path.isfile(os.path.join(dir_or_model, "adapter_config.json"))
+    if is_lora_dir:
+        loaded_json = json.load(open(os.path.join(dir_or_model, "adapter_config.json"), "r"))
+        model_name = loaded_json["base_model_name_or_path"]
+    else:
+        model_name = dir_or_model
+    if os.path.isfile(os.path.join(dir_or_model, "config.json")):
+        loaded_json = json.load(open(os.path.join(dir_or_model, "config.json"), "r"))
+        if "_name_or_path" in loaded_json:
+            model_name = loaded_json["_name_or_path"]
+    local_model_name = "/data3/MODELS/llama2-hf/llama-2-7b"#/data2/tsq/WaterBench/data/models/llama-2-7b-chat-hf
+    print(">>>>>>>>>>>>>>>>>>>>>>>>>>notice this<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
+    #print(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(local_model_name)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+    return tokenizer
+def load_model(dir_or_model, classification=False, token_classification=False, return_tokenizer=False, dtype=torch.bfloat16, load_dtype=True,
+                rl=False, peft_config=None, device_map="auto", revision='main'):
+    """
+    This function is used to load a model based on several parameters including the type of task it is targeted to perform.
+    Args:
+        dir_or_model: It can be either a directory containing the pre-training model configuration details or a pretrained model.
+        classification (bool): If True, loads the model for sequence classification.
+        token_classification (bool): If True, loads the model for token classification.
+        return_tokenizer (bool): If True, returns the tokenizer along with the model.
+        dtype: The data type that PyTorch should use internally to store the model’s parameters and do the computation.
+        load_dtype (bool): If False, sets dtype as torch.float32 regardless of the passed dtype value.
+        rl (bool): If True, loads model specifically designed to be used in reinforcement learning environment.
+        peft_config: Configuration details for Peft models.
+    Returns:
+        It returns a model for the required task along with its tokenizer, if specified.
+    """
+    is_lora_dir = os.path.isfile(os.path.join(dir_or_model, "adapter_config.json"))
+    if not load_dtype:
+        dtype = torch.float32
+    if is_lora_dir:
+        loaded_json = json.load(open(os.path.join(dir_or_model, "adapter_config.json"), "r"))
+        model_name = loaded_json["base_model_name_or_path"]
+    else:
+        model_name = dir_or_model
+    original_model_name = model_name
+    if classification:
+        model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32, use_auth_token=True, device_map=device_map, revision=revision)  # to investigate: calling torch_dtype here fails.
+    elif token_classification:
+        model = AutoModelForTokenClassification.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32, use_auth_token=True, device_map=device_map, revision=revision)
+    else:
+        if model_name.endswith("GPTQ") or model_name.endswith("GGML"):
+            model = AutoGPTQForCausalLM.from_quantized(model_name,
+                                                        use_safetensors=True,
+                                                        trust_remote_code=True,
+                                                        \# use_triton=True, # breaks currently, unfortunately generation time of the GPTQ model is quite slow
+                                                        quantize_config=None, device_map=device_map)
+        else:
+            print('11111111111111111111111111111111111111')
+            model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32, use_auth_token=True, device_map=device_map, revision=revision)
+    if is_lora_dir:
+        model = PeftModel.from_pretrained(model, dir_or_model)
+    try:
+        tokenizer = load_tokenizer(original_model_name)
+        model.config.pad_token_id = tokenizer.pad_token_id
+    except Exception:
+        pass
+    if return_tokenizer:
+        return model, load_tokenizer(original_model_name)
+    return model
+model_name = 'tsq2000/Jailbreak-generator'
+model = load_model(model_name)
+tokenizer = load_tokenizer(model_name)
+```
+# How to generate jailbreak prompts
+Here is an example of how to generate jailbreak prompts based on knowledge point texts.
+```python
+model_name = 'tsq2000/Jailbreak-generator'
+model = load_model(model_name)
+tokenizer = load_tokenizer(model_name)
+max_length = 2048
+max_tokens = 64
+knowledge_points = ["Kettling Kettling (also known as containment or corralling) is a police tactic for controlling large crowds during demonstrations or protests. It involves the formation of large cordons of police officers who then move to contain a crowd within a limited area. Protesters are left only one choice of exit controlled by the police – or are completely prevented from leaving, with the effect of denying the protesters access to food, water and toilet facilities for a time period determined by the police forces. The tactic has proved controversial, in part because it has resulted in the detention of ordinary bystanders."]
+batch_texts = [f'### Input:\n{input_}\n\n### Response:\n' for input_ in knowledge_points]
+inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length - max_tokens).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=max_tokens,      num_return_sequences=1, do_sample=False, temperature=1, top_p=1, eos_token_id=tokenizer.eos_token_id)
+generated_texts = []
+for output, input_text in zip(outputs, batch_texts):
+    text = tokenizer.decode(output, skip_special_tokens=True)
+    generated_texts.append(text[len(input_text):])
+print(generated_texts)
+```
+# Citation
+If you find this model useful, please cite the following paper:
+```
+@misc{tu2024knowledgetojailbreak,
+      title={Knowledge-to-Jailbreak: One Knowledge Point Worth One Attack},
+      author={Shangqing Tu and Zhuoran Pan and Wenxuan Wang and Zhexin Zhang and Yuliang Sun and Jifan Yu and Hongning Wang and Lei Hou and Juanzi Li},
+      year={2024},
+      eprint={2406.11682},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2406.11682},
+}
+```