tsq2000 committed on
Commit
76770f7
•
1 Parent(s): c76eefa

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +272 -277
README.md CHANGED
@@ -1,277 +1,272 @@
1
- **# Introduction**
2
-
3
- This is a model for generating a jailbreak prompt based on knowledge point texts. The model is trained on the Llama-2-7b dataset and fine-tuned on the Knowledge-to-Jailbreak dataset. The model is intended to bridge the gap between theoretical vulnerabilities and real-world application scenarios, simulating sophisticated adversarial attacks that incorporate specialized knowledge.
4
-
5
- Our proposed method and dataset serve as a critical starting point for both offensive and defensive research, enabling the development of new techniques to enhance the security and robustness of language models in practical settings.
6
-
7
- **# How to load the model and tokenizer**
8
-
9
- We provide two helper functions for loading the model and tokenizer.
10
-
11
- \```python
12
-
13
- import torch
14
-
15
- from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
16
-
17
- import os
18
-
19
- import json
20
-
21
- from peft import PeftModel
22
-
23
- \# from trl import AutoModelForCausalLMWithValueHead
24
-
25
- from transformers import AutoModelForCausalLM as AutoGPTQForCausalLM
26
-
27
- def load_tokenizer(dir_or_model):
28
-
29
- ​ """
30
-
31
- ​ This function is used to load the tokenizer for a specific pre-trained model.
32
-
33
- ​
34
-
35
- ​ Args:
36
-
37
- ​ dir_or_model: It can be either a directory containing the pre-training model configuration details or a pretrained model.
38
-
39
- ​
40
-
41
- ​ Returns:
42
-
43
- ​ It returns a tokenizer that can convert text to tokens for the specific model input.
44
-
45
- ​ """
46
-
47
- ​ is_lora_dir = os.path.isfile(os.path.join(dir_or_model, "adapter_config.json"))
48
-
49
- ​ if is_lora_dir:
50
-
51
- ​ loaded_json = json.load(open(os.path.join(dir_or_model, "adapter_config.json"), "r"))
52
-
53
- ​ model_name = loaded_json["base_model_name_or_path"]
54
-
55
- ​ else:
56
-
57
- ​ model_name = dir_or_model
58
-
59
- ​
60
-
61
- ​ if os.path.isfile(os.path.join(dir_or_model, "config.json")):
62
-
63
- ​ loaded_json = json.load(open(os.path.join(dir_or_model, "config.json"), "r"))
64
-
65
- ​ if "_name_or_path" in loaded_json:
66
-
67
- ​ model_name = loaded_json["_name_or_path"]
68
-
69
- ​ local_model_name = "/data3/MODELS/llama2-hf/llama-2-7b"#/data2/tsq/WaterBench/data/models/llama-2-7b-chat-hf
70
-
71
- ​
72
-
73
- ​ print(">>>>>>>>>>>>>>>>>>>>>>>>>>notice this<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
74
-
75
- ​
76
-
77
- ​ \#print(model_name)
78
-
79
- ​ tokenizer = AutoTokenizer.from_pretrained(local_model_name)
80
-
81
- ​ if tokenizer.pad_token is None:
82
-
83
- ​ tokenizer.pad_token = tokenizer.eos_token
84
-
85
- ​ tokenizer.pad_token_id = tokenizer.eos_token_id
86
-
87
- ​
88
-
89
- ​ return tokenizer
90
-
91
- def load_model(dir_or_model, classification=False, token_classification=False, return_tokenizer=False, dtype=torch.bfloat16, load_dtype=True,
92
-
93
- ​ rl=False, peft_config=None, device_map="auto", revision='main'):
94
-
95
- ​ """
96
-
97
- ​ This function is used to load a model based on several parameters including the type of task it is targeted to perform.
98
-
99
- ​
100
-
101
- ​ Args:
102
-
103
- ​ dir_or_model: It can be either a directory containing the pre-training model configuration details or a pretrained model.
104
-
105
- ​ classification (bool): If True, loads the model for sequence classification.
106
-
107
- ​ token_classification (bool): If True, loads the model for token classification.
108
-
109
- ​ return_tokenizer (bool): If True, returns the tokenizer along with the model.
110
-
111
- ​ dtype: The data type that PyTorch should use internally to store the model's parameters and do the computation.
112
-
113
- ​ load_dtype (bool): If False, sets dtype as torch.float32 regardless of the passed dtype value.
114
-
115
- ​ rl (bool): If True, loads model specifically designed to be used in reinforcement learning environment.
116
-
117
- ​ peft_config: Configuration details for Peft models.
118
-
119
- ​
120
-
121
- ​ Returns:
122
-
123
- ​ It returns a model for the required task along with its tokenizer, if specified.
124
-
125
- ​ """
126
-
127
- ​ is_lora_dir = os.path.isfile(os.path.join(dir_or_model, "adapter_config.json"))
128
-
129
- ​ if not load_dtype:
130
-
131
- ​ dtype = torch.float32
132
-
133
- ​ if is_lora_dir:
134
-
135
- ​ loaded_json = json.load(open(os.path.join(dir_or_model, "adapter_config.json"), "r"))
136
-
137
- ​ model_name = loaded_json["base_model_name_or_path"]
138
-
139
- ​ else:
140
-
141
- ​ model_name = dir_or_model
142
-
143
- ​ original_model_name = model_name
144
-
145
- ​ \#local_model_name = "/data1/tsq/zkj_use/MODELS/phi-2"
146
-
147
- ​ \#local_model_name = "/data2/tsq/WaterBench/data/models/llama-2-7b-chat-hf"
148
-
149
- ​ \#local_model_name = "/data3/MODELS/llama2-hf/llama-2-7b"
150
-
151
- ​ \#print(model_name)
152
-
153
- ​ if classification:
154
-
155
- ​ model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32, use_auth_token=True, device_map=device_map, revision=revision) # to investigate: calling torch_dtype here fails.
156
-
157
- ​ elif token_classification:
158
-
159
- ​ model = AutoModelForTokenClassification.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32, use_auth_token=True, device_map=device_map, revision=revision)
160
-
161
- ​ \# elif rl:
162
-
163
- ​ \# model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32, use_auth_token=True,
164
-
165
- ​ \# peft_config=peft_config, device_map=device_map, revision=revision)
166
-
167
- ​ else:
168
-
169
- ​ if model_name.endswith("GPTQ") or model_name.endswith("GGML"):
170
-
171
- ​ model = AutoGPTQForCausalLM.from_quantized(model_name,
172
-
173
- ​ use_safetensors=True,
174
-
175
- ​ trust_remote_code=True,
176
-
177
- ​ \# use_triton=True, # breaks currently, unfortunately generation time of the GPTQ model is quite slow
178
-
179
- ​ quantize_config=None, device_map=device_map)
180
-
181
- ​ else:
182
-
183
- ​ print('11111111111111111111111111111111111111')
184
-
185
- ​ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32, use_auth_token=True, device_map=device_map, revision=revision)
186
-
187
- ​ if is_lora_dir:
188
-
189
- ​ model = PeftModel.from_pretrained(model, dir_or_model)
190
-
191
- ​
192
-
193
- ​ try:
194
-
195
- ​ tokenizer = load_tokenizer(original_model_name)
196
-
197
- ​ model.config.pad_token_id = tokenizer.pad_token_id
198
-
199
- ​ except Exception:
200
-
201
- ​ pass
202
-
203
- ​ if return_tokenizer:
204
-
205
- ​ return model, load_tokenizer(original_model_name)
206
-
207
- ​ return model
208
-
209
- model_name = 'tsq2000/Jailbreak-generator'
210
-
211
- model = load_model(model_name)
212
-
213
- tokenizer = load_tokenizer(model_name)
214
-
215
- \```
216
-
217
- **# How to generate jailbreak prompts**
218
-
219
- Here is an example of how to generate jailbreak prompts based on knowledge point texts.
220
-
221
- \```python
222
-
223
- model_name = 'tsq2000/Jailbreak-generator'
224
-
225
- model = load_model(model_name)
226
-
227
- tokenizer = load_tokenizer(model_name)
228
-
229
- max_length = 2048
230
-
231
- max_tokens = 64
232
-
233
- knowledge_points = ["Kettling Kettling (also known as containment or corralling) is a police tactic for controlling large crowds during demonstrations or protests. It involves the formation of large cordons of police officers who then move to contain a crowd within a limited area. Protesters are left only one choice of exit controlled by the police – or are completely prevented from leaving, with the effect of denying the protesters access to food, water and toilet facilities for a time period determined by the police forces. The tactic has proved controversial, in part because it has resulted in the detention of ordinary bystanders."]
234
-
235
- batch_texts = [f'### Input:\n{input_}\n\n### Response:\n' for input_ in knowledge_points]
236
-
237
- inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length - max_tokens).to(model.device)
238
-
239
- outputs = model.generate(**inputs, max_new_tokens=max_tokens, num_return_sequences=1, do_sample=False, temperature=1, top_p=1, eos_token_id=tokenizer.eos_token_id)
240
-
241
- generated_texts = []
242
-
243
- for output, input_text in zip(outputs, batch_texts):
244
-
245
- ​ text = tokenizer.decode(output, skip_special_tokens=True)
246
-
247
- ​ generated_texts.append(text[len(input_text):])
248
-
249
- print(generated_texts)
250
-
251
- \```
252
-
253
- **# Citation**
254
-
255
- If you find this model useful, please cite the following paper:
256
-
257
- \```
258
-
259
- @misc{tu2024knowledgetojailbreak,
260
-
261
- ​ title={Knowledge-to-Jailbreak: One Knowledge Point Worth One Attack},
262
-
263
- ​ author={Shangqing Tu and Zhuoran Pan and Wenxuan Wang and Zhexin Zhang and Yuliang Sun and Jifan Yu and Hongning Wang and Lei Hou and Juanzi Li},
264
-
265
- ​ year={2024},
266
-
267
- ​ eprint={2406.11682},
268
-
269
- ​ archivePrefix={arXiv},
270
-
271
- ​ primaryClass={cs.CL},
272
-
273
- ​ url={https://arxiv.org/abs/2406.11682},
274
-
275
- }
276
-
277
- \```
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ tags:
6
+ - llm
7
+ - safety
8
+ - jailbreak
9
+ - knowledge
10
+ ---
11
+ # Introduction
12
+
13
+ This is a model for generating jailbreak prompts from knowledge-point texts. The model is built on the Llama-2-7b base model and fine-tuned on the Knowledge-to-Jailbreak dataset. It is intended to bridge the gap between theoretical vulnerabilities and real-world application scenarios, simulating sophisticated adversarial attacks that incorporate specialized knowledge.
14
+
15
+ Our proposed method and dataset serve as a critical starting point for both offensive and defensive research, enabling the development of new techniques to enhance the security and robustness of language models in practical settings.
16
+
17
+ # How to load the model and tokenizer
18
+
19
+ We provide two helper functions for loading the model and tokenizer.
20
+
21
+ ```python
22
+
23
+ import torch
24
+
25
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
26
+
27
+ import os
28
+
29
+ import json
30
+
31
+ from peft import PeftModel
32
+
33
+ # from trl import AutoModelForCausalLMWithValueHead
34
+
35
+ from transformers import AutoModelForCausalLM as AutoGPTQForCausalLM
36
+
37
+ def load_tokenizer(dir_or_model):
38
+
39
+ ​ """
40
+
41
+ ​ This function is used to load the tokenizer for a specific pre-trained model.
42
+
43
+ ​
44
+
45
+ ​ Args:
46
+
47
+ ​ dir_or_model: It can be either a directory containing the pre-training model configuration details or a pretrained model.
48
+
49
+ ​
50
+
51
+ ​ Returns:
52
+
53
+ ​ It returns a tokenizer that can convert text to tokens for the specific model input.
54
+
55
+ ​ """
56
+
57
+ ​ is_lora_dir = os.path.isfile(os.path.join(dir_or_model, "adapter_config.json"))
58
+
59
+ ​ if is_lora_dir:
60
+
61
+ ​ loaded_json = json.load(open(os.path.join(dir_or_model, "adapter_config.json"), "r"))
62
+
63
+ ​ model_name = loaded_json["base_model_name_or_path"]
64
+
65
+ ​ else:
66
+
67
+ ​ model_name = dir_or_model
68
+
69
+ ​
70
+
71
+ ​ if os.path.isfile(os.path.join(dir_or_model, "config.json")):
72
+
73
+ ​ loaded_json = json.load(open(os.path.join(dir_or_model, "config.json"), "r"))
74
+
75
+ ​ if "_name_or_path" in loaded_json:
76
+
77
+ ​ model_name = loaded_json["_name_or_path"]
78
+
79
+ ​ local_model_name = "/data3/MODELS/llama2-hf/llama-2-7b"#/data2/tsq/WaterBench/data/models/llama-2-7b-chat-hf
80
+
81
+ ​
82
+
83
+ ​ print(">>>>>>>>>>>>>>>>>>>>>>>>>>notice this<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
84
+
85
+ ​
86
+
87
+ ​ #print(model_name)
88
+
89
+ ​ tokenizer = AutoTokenizer.from_pretrained(local_model_name)
90
+
91
+ ​ if tokenizer.pad_token is None:
92
+
93
+ ​ tokenizer.pad_token = tokenizer.eos_token
94
+
95
+ ​ tokenizer.pad_token_id = tokenizer.eos_token_id
96
+
97
+ ​
98
+
99
+ ​ return tokenizer
100
+
101
+ def load_model(dir_or_model, classification=False, token_classification=False, return_tokenizer=False, dtype=torch.bfloat16, load_dtype=True,
102
+
103
+ ​ rl=False, peft_config=None, device_map="auto", revision='main'):
104
+
105
+ ​ """
106
+
107
+ ​ This function is used to load a model based on several parameters including the type of task it is targeted to perform.
108
+
109
+ ​
110
+
111
+ ​ Args:
112
+
113
+ ​ dir_or_model: It can be either a directory containing the pre-training model configuration details or a pretrained model.
114
+
115
+ ​ classification (bool): If True, loads the model for sequence classification.
116
+
117
+ ​ token_classification (bool): If True, loads the model for token classification.
118
+
119
+ ​ return_tokenizer (bool): If True, returns the tokenizer along with the model.
120
+
121
+ ​ dtype: The data type that PyTorch should use internally to store the model's parameters and do the computation.
122
+
123
+ ​ load_dtype (bool): If False, sets dtype as torch.float32 regardless of the passed dtype value.
124
+
125
+ ​ rl (bool): If True, loads model specifically designed to be used in reinforcement learning environment.
126
+
127
+ ​ peft_config: Configuration details for Peft models.
128
+
129
+ ​
130
+
131
+ ​ Returns:
132
+
133
+ ​ It returns a model for the required task along with its tokenizer, if specified.
134
+
135
+ ​ """
136
+
137
+ ​ is_lora_dir = os.path.isfile(os.path.join(dir_or_model, "adapter_config.json"))
138
+
139
+ ​ if not load_dtype:
140
+
141
+ ​ dtype = torch.float32
142
+
143
+ ​ if is_lora_dir:
144
+
145
+ ​ loaded_json = json.load(open(os.path.join(dir_or_model, "adapter_config.json"), "r"))
146
+
147
+ ​ model_name = loaded_json["base_model_name_or_path"]
148
+
149
+ ​ else:
150
+
151
+ ​ model_name = dir_or_model
152
+
153
+ ​ original_model_name = model_name
154
+
155
+ ​ if classification:
156
+
157
+ ​ model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32, use_auth_token=True, device_map=device_map, revision=revision) # to investigate: calling torch_dtype here fails.
158
+
159
+ ​ elif token_classification:
160
+
161
+ ​ model = AutoModelForTokenClassification.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32, use_auth_token=True, device_map=device_map, revision=revision)
162
+
163
+ ​ else:
164
+
165
+ ​ if model_name.endswith("GPTQ") or model_name.endswith("GGML"):
166
+
167
+ ​ model = AutoGPTQForCausalLM.from_quantized(model_name,
168
+
169
+ ​ use_safetensors=True,
170
+
171
+ ​ trust_remote_code=True,
172
+
173
+ ​ # use_triton=True, # breaks currently, unfortunately generation time of the GPTQ model is quite slow
174
+
175
+ ​ quantize_config=None, device_map=device_map)
176
+
177
+ ​ else:
178
+
179
+ ​ print('11111111111111111111111111111111111111')
180
+
181
+ ​ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32, use_auth_token=True, device_map=device_map, revision=revision)
182
+
183
+ ​ if is_lora_dir:
184
+
185
+ ​ model = PeftModel.from_pretrained(model, dir_or_model)
186
+
187
+ ​
188
+
189
+ ​ try:
190
+
191
+ ​ tokenizer = load_tokenizer(original_model_name)
192
+
193
+ ​ model.config.pad_token_id = tokenizer.pad_token_id
194
+
195
+ ​ except Exception:
196
+
197
+ ​ pass
198
+
199
+ ​ if return_tokenizer:
200
+
201
+ ​ return model, load_tokenizer(original_model_name)
202
+
203
+ ​ return model
204
+
205
+ model_name = 'tsq2000/Jailbreak-generator'
206
+
207
+ model = load_model(model_name)
208
+
209
+ tokenizer = load_tokenizer(model_name)
210
+
211
+ ```
212
+
213
+ # How to generate jailbreak prompts
214
+
215
+ Here is an example of how to generate jailbreak prompts based on knowledge point texts.
216
+
217
+ ```python
218
+
219
+ model_name = 'tsq2000/Jailbreak-generator'
220
+
221
+ model = load_model(model_name)
222
+
223
+ tokenizer = load_tokenizer(model_name)
224
+
225
+ max_length = 2048
226
+
227
+ max_tokens = 64
228
+
229
+ knowledge_points = ["Kettling Kettling (also known as containment or corralling) is a police tactic for controlling large crowds during demonstrations or protests. It involves the formation of large cordons of police officers who then move to contain a crowd within a limited area. Protesters are left only one choice of exit controlled by the police – or are completely prevented from leaving, with the effect of denying the protesters access to food, water and toilet facilities for a time period determined by the police forces. The tactic has proved controversial, in part because it has resulted in the detention of ordinary bystanders."]
230
+
231
+ batch_texts = [f'### Input:\n{input_}\n\n### Response:\n' for input_ in knowledge_points]
232
+
233
+ inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length - max_tokens).to(model.device)
234
+
235
+ outputs = model.generate(**inputs, max_new_tokens=max_tokens, num_return_sequences=1, do_sample=False, temperature=1, top_p=1, eos_token_id=tokenizer.eos_token_id)
236
+
237
+ generated_texts = []
238
+
239
+ for output, input_text in zip(outputs, batch_texts):
240
+
241
+ ​ text = tokenizer.decode(output, skip_special_tokens=True)
242
+
243
+ ​ generated_texts.append(text[len(input_text):])
244
+
245
+ print(generated_texts)
246
+
247
+ ```
248
+
249
+ # Citation
250
+
251
+ If you find this model useful, please cite the following paper:
252
+
253
+ ```
254
+ @misc{tu2024knowledgetojailbreak,
255
+
256
+ ​ title={Knowledge-to-Jailbreak: One Knowledge Point Worth One Attack},
257
+
258
+ ​ author={Shangqing Tu and Zhuoran Pan and Wenxuan Wang and Zhexin Zhang and Yuliang Sun and Jifan Yu and Hongning Wang and Lei Hou and Juanzi Li},
259
+
260
+ ​ year={2024},
261
+
262
+ ​ eprint={2406.11682},
263
+
264
+ ​ archivePrefix={arXiv},
265
+
266
+ ​ primaryClass={cs.CL},
267
+
268
+ ​ url={https://arxiv.org/abs/2406.11682},
269
+
270
+ }
271
+
272
+ ```