sudoaza committed on
Commit
00892f8
0 Parent(s):

initial commit

README.md ADDED
@@ -0,0 +1,7 @@
+ # Rockdich
+
+ Toolkit to translate password dictionaries into German, and potentially other languages. It includes scripts to generate the training dataset using a local Ollama instance or the OpenAI API, to fine-tune a llama3 model on the translation task with Unsloth, and to translate a password dictionary with the fine-tuned model. A German translation of rockyou.txt is included as rockdich.txt. Each script has a description comment at the top of its file.
+
+ See the Hugging Face repo for the model: https://huggingface.co/sudoaza/rockdich
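+
+ Rough pipeline, using the flags and file names hardcoded in the scripts: translate a seed list with `python translate_oai.py -i orig_4k.txt -o de_4k.txt`, split translated from unchanged passwords with `python split_translations.py`, consolidate the split outputs into `original_train.txt`/`translated_train.txt`/`untranslated.txt`, build the instruction dataset with `python build_dataset.py`, and finally translate a full dictionary with `python translate_final.py -i rockyou.txt -o rockdich.txt`.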
build_dataset.py ADDED
@@ -0,0 +1,88 @@
+ """Augment the translated/untranslated passwords and create a dataset for the password translation task."""
+
+ import random
+
+ import pandas as pd
+
+ N_SAMPLES = 10000
+
+ # Leet-style substitutions, tried in order; only the first matching letter is replaced
+ LEET_SUBS = [("e", "3"), ("E", "3"), ("i", "1"), ("I", "1"), ("o", "0"),
+              ("O", "0"), ("a", "4"), ("A", "4"), ("t", "7"), ("T", "7")]
+
+ def mutate_password_pair(pair):
+     # 20% of the time, capitalize the first letter
+     if random.random() < 0.2:
+         pair = (pair[0].capitalize(), pair[1].capitalize())
+     # 20% of the time, append a digit
+     if random.random() < 0.2:
+         number = random.randint(0, 9)
+         pair = (pair[0] + str(number), pair[1] + str(number))
+     # 20% of the time, append a symbol
+     if random.random() < 0.2:
+         symbol = random.choice(['!', '@', '#', '$', '%', '&', '*'])
+         pair = (pair[0] + symbol, pair[1] + symbol)
+     # 20% of the time, replace a letter with a look-alike number
+     if random.random() < 0.2:
+         for letter, number in LEET_SUBS:
+             if letter in pair[0]:
+                 # replace only the first occurrence, in both passwords
+                 pair = (pair[0].replace(letter, number, 1), pair[1].replace(letter, number, 1))
+                 break
+     return pair
+
+ def create_dataframes():
+     # Read the files
+     with open('original_train.txt', 'r', encoding='latin1') as file:
+         original = file.readlines()
+     with open('translated_train.txt', 'r', encoding='utf-8') as file:
+         translated = file.readlines()
+     with open('untranslated.txt', 'r', encoding='latin1') as file:
+         untranslated = file.readlines()
+
+     # Pair each original password with its translation
+     df_translated = pd.DataFrame({
+         'original': [line.strip() for line in original],
+         'translated': [line.strip() for line in translated]
+     })
+
+     # Passwords that stay the same after translation
+     untranslated_list = [line.strip() for line in untranslated]
+
+     # Generate N_SAMPLES instruction rows
+     rows = []
+     for _ in range(N_SAMPLES):
+         # Randomly pick 8 translated pairs
+         sampled_translated = df_translated.sample(8)
+         original_samples = sampled_translated['original'].tolist()
+         translated_samples = sampled_translated['translated'].tolist()
+
+         # Randomly pick 2 untranslated passwords
+         untranslated_samples = random.sample(untranslated_list, 2)
+
+         # Combine and shuffle while maintaining the pairing
+         total_input = original_samples + untranslated_samples
+         total_output = translated_samples + untranslated_samples
+
+         combined_list = list(zip(total_input, total_output))
+         random.shuffle(combined_list)
+         combined_list = [mutate_password_pair(pair) for pair in combined_list]
+         shuffled_input, shuffled_output = zip(*combined_list)
+
+         rows.append({
+             'instruction': 'Translate these passwords while keeping the original format.',
+             'input': "\n".join(shuffled_input),
+             'output': "\n".join(shuffled_output)
+         })
+
+     return pd.DataFrame(rows, columns=['instruction', 'input', 'output'])
+
+ if __name__ == "__main__":
+     # Generate the dataframe
+     df_instructions = create_dataframes()
+
+     # Output to check
+     print(df_instructions.head())
+
+     # Save the DataFrame to a CSV
+     df_instructions.to_csv('password_translation_instructions.csv', index=False)
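A minimal sketch of the augmentation in action (the example pair and seed are illustrative; importing build_dataset is side-effect free thanks to the `__main__` guard):

```python
import random
from build_dataset import mutate_password_pair

random.seed(0)  # arbitrary seed, only to make the sketch repeatable
pair = ("iloveyou", "ichliebedich")
for _ in range(3):
    # Each call may capitalize, append a digit or symbol, or leet-swap a
    # letter, applying the same change to both sides of the pair.
    print(mutate_password_pair(pair))
```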
password_translation_instructions.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ xformers
+ trl
+ peft
+ accelerate
+ bitsandbytes
+ unsloth
+ torch
+ pandas
+ tqdm
+ openai
+ ollama
split_translations.py ADDED
@@ -0,0 +1,63 @@
+ """Split passwords into translated and untranslated lists."""
+
+ def read_passwords(file_path):
+     """Read passwords from a file and return them as a list."""
+     with open(file_path, 'r', encoding='latin1') as file:
+         return file.read().splitlines()
+
+ def compare_passwords(file_path1, file_path2):
+     """Compare passwords from two files line by line and categorize them, preserving order."""
+     passwords1 = read_passwords(file_path1)
+     passwords2 = read_passwords(file_path2)
+     unique_passwords_1 = []
+     unique_passwords_2 = []
+     common_passwords = []
+     # Both files are expected to have the same number of lines
+     for password1, password2 in zip(passwords1, passwords2):
+         if password1 == password2:
+             common_passwords.append(password1)
+         else:
+             unique_passwords_1.append(password1)
+             unique_passwords_2.append(password2)
+
+     return common_passwords, unique_passwords_1, unique_passwords_2
+
+ def save_passwords(file_path, password_list):
+     """Save the list of passwords to a file."""
+     with open(file_path, 'w', encoding='latin1') as file:
+         for password in password_list:
+             file.write(password + '\n')
+
+ def main():
+     # Define the paths to the input and output files
+     # 1st Run
+     file_path1 = 'orig_4k.txt'
+     file_path2 = 'de_4k.txt'
+     untranslated_file = 'untranslated.txt'
+     orig_translated_file = 'orig_translated.txt'
+     trans_translated_file = 'trans_translated.txt'
+
+     # 2nd Run
+     # file_path1 = 'untranslated.txt'
+     # file_path2 = 're_translated.txt'
+     # untranslated_file = 'untranslated2.txt'
+     # orig_translated_file = 'orig_translated2.txt'
+     # trans_translated_file = 'trans_translated2.txt'
+
+     # 3rd Run
+     # file_path1 = 'untranslated2.txt'
+     # file_path2 = 're_translated2.txt'
+     # untranslated_file = 'untranslated3.txt'
+     # orig_translated_file = 'orig_translated3.txt'
+     # trans_translated_file = 'trans_translated3.txt'
+
+     # Compare passwords and get the lists
+     common_passwords, unique_passwords_1, unique_passwords_2 = compare_passwords(file_path1, file_path2)
+
+     # Save the resulting lists to files
+     save_passwords(untranslated_file, common_passwords)
+     save_passwords(orig_translated_file, unique_passwords_1)
+     save_passwords(trans_translated_file, unique_passwords_2)
+
+ if __name__ == "__main__":
+     main()
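A minimal sketch of the categorization logic using throwaway demo files (file names and contents here are hypothetical; a password that survives translation unchanged, like "abc123", counts as untranslated):

```python
from pathlib import Path
from split_translations import compare_passwords

# Two parallel three-line files: one original, one machine-translated
Path("demo_orig.txt").write_text("password\nabc123\niloveyou\n", encoding="latin1")
Path("demo_de.txt").write_text("passwort\nabc123\nichliebedich\n", encoding="latin1")

common, orig_only, trans_only = compare_passwords("demo_orig.txt", "demo_de.txt")
print(common)      # ['abc123']
print(orig_only)   # ['password', 'iloveyou']
print(trans_only)  # ['passwort', 'ichliebedich']
```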
translate_final.py ADDED
@@ -0,0 +1,93 @@
+ """Translate a password dictionary using a finetuned model."""
+
+ import argparse
+ import re
+
+ import torch
+ from tqdm import tqdm
+ from unsloth import FastLanguageModel
+
+ max_seq_length = 2048  # Unsloth supports RoPE scaling internally, so any length works
+ dtype = torch.float16  # None for auto detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
+ load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
+
+ # Load the LoRA model saved during fine-tuning
+ model, tokenizer = FastLanguageModel.from_pretrained(
+     model_name = "lora_model",  # the model you saved after training
+     max_seq_length = max_seq_length,
+     dtype = dtype,
+     load_in_4bit = load_in_4bit,
+ )
+ FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
+ tokenizer.padding_side = "left"
+
+ alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+ ### Instruction:
+ {}
+
+ ### Input:
+ {}
+
+ ### Response:
+ {}"""
+
+ def extract_response(text):
+     # Capture everything after "### Response:" to the end of the generation
+     match = re.search(r"### Response:\n(.*?)$", text, re.DOTALL)
+     if match:
+         response = match.group(1)
+         response = response.replace("<|end_of_text|>", "")
+         if not response.endswith("\n"):
+             response += "\n"
+         return response
+     else:
+         raise ValueError("No response found in the text.")
+
+ def process_batch(batch):
+     inputs = []
+     chunk_size = 10  # Passwords per prompt; adjust based on your needs
+     for i in range(0, len(batch), chunk_size):
+         chunk = ''.join(batch[i:i+chunk_size])
+         inputs.append(alpaca_prompt.format(
+             "Translate these passwords while keeping the original format.",  # instruction
+             chunk,  # input
+             "",  # output - leave this blank for generation!
+         ))
+
+     input_tokens = tokenizer(inputs, return_tensors = "pt", padding = True).to("cuda")
+     outputs = model.generate(**input_tokens, max_new_tokens = 64, use_cache = True)
+     return [extract_response(text) for text in tokenizer.batch_decode(outputs)]
+
+ BATCH_SIZE = 1000
+
+ def process_file(infile, outfile):
+     try:
+         with open(infile, 'r', encoding='latin1') as file:
+             lines = file.readlines()
+
+         translated_lines = []
+
+         # tqdm gives a progress bar over the batches
+         for i in tqdm(range(0, len(lines), BATCH_SIZE)):
+             translated_batch = process_batch(lines[i:i+BATCH_SIZE])
+             translated_lines.extend(translated_batch)
+
+         # Write the translated text to another file
+         with open(outfile, 'w', encoding='utf-8') as file:
+             file.writelines(translated_lines)
+
+     except FileNotFoundError:
+         print("The input file was not found.")
+
+ def main():
+     parser = argparse.ArgumentParser(description="Translate a password dictionary to German.")
+     parser.add_argument("-i", "--input_file", required=True, help="Path to the input text file")
+     parser.add_argument("-o", "--output_file", required=True, help="Path to the output text file where translated text will be saved")
+
+     args = parser.parse_args()
+
+     process_file(args.input_file, args.output_file)
+
+ if __name__ == "__main__":
+     main()
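For reference, `extract_response` assumes the decoded generation still contains the full Alpaca prompt; a self-contained sketch of the extraction step on an illustrative decoded string:

```python
import re

decoded = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Translate these passwords while keeping the original format.

### Input:
password1
iloveyou!

### Response:
passwort1
ichliebedich!<|end_of_text|>"""

# Same regex and cleanup as extract_response in translate_final.py
match = re.search(r"### Response:\n(.*?)$", decoded, re.DOTALL)
response = match.group(1).replace("<|end_of_text|>", "")
print(response)  # passwort1\nichliebedich!
```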
translate_oai.py ADDED
@@ -0,0 +1,82 @@
+ """Script to use the OpenAI API to translate passwords from English to German. Used to build a starting dataset for the password translation task."""
+ import argparse
+
+ from openai import OpenAI
+
+ client = OpenAI()
+
+ SYSTEM_PROMPT = """Translate the following password list to German. RESPECT the original casing even when it is grammatically incorrect. Don't add spaces or separators between words if they are not in the original. Respond only with the translated words one per line, nothing else.
+ Words:
+ password
+ iloveyou
+ princess
+ rockyou
+ abc123
+ nicole
+ loveyou
+
+ Translations:
+ passwort
+ ichliebedich
+ prinzessin
+ rockdich
+ abc123
+ nicole
+ liebedich
+ """
+
+ TRANSLATE_PROMPT = """Words:
+ <<INPUT>>
+
+ Translations:
+ """
+
+ def translate_to_german(text):
+     """Translate English text to German using the OpenAI API."""
+     chat_response = client.chat.completions.create(model="gpt-4",
+         messages=[
+             {"role": "system", "content": SYSTEM_PROMPT},
+             {"role": "user", "content": TRANSLATE_PROMPT.replace("<<INPUT>>", text)}
+         ])
+     response = chat_response.choices[0].message.content
+     if not response.endswith("\n"):
+         response += "\n"
+     return response
+
+
+ def process_file(input_file_path, output_file_path):
+     """Process the file in chunks and translate each chunk."""
+     try:
+         with open(input_file_path, 'r', encoding='latin1') as file:
+             lines = file.readlines()
+
+         translated_lines = []
+         chunk_size = 10  # Adjust the chunk size based on your needs
+
+         # Process the file in chunks
+         for i in range(0, len(lines), chunk_size):
+             chunk = ''.join(lines[i:i+chunk_size])
+             print("SENT", chunk)  # Debug print to trace what is sent for translation
+             translated_chunk = translate_to_german(chunk)
+             print("GOT", translated_chunk)  # Debug print to see the translation
+             translated_lines.append(translated_chunk)
+
+         # Write the translated text to another file
+         with open(output_file_path, 'w', encoding='utf-8') as file:
+             file.writelines(translated_lines)
+
+     except FileNotFoundError:
+         print("The input file was not found.")
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Translate text file content to German.")
+     parser.add_argument("-i", "--input_file", required=True, help="Path to the input text file")
+     parser.add_argument("-o", "--output_file", required=True, help="Path to the output text file where translated text will be saved")
+
+     args = parser.parse_args()
+
+     process_file(args.input_file, args.output_file)
+
+ if __name__ == "__main__":
+     main()
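The chunking is plain list slicing; a dry-run sketch that assembles the prompts without touching the API (the sample lines are illustrative):

```python
TRANSLATE_PROMPT = """Words:
<<INPUT>>

Translations:
"""

lines = [f"password{n}\n" for n in range(25)]  # stand-in for file.readlines()
chunk_size = 10
for i in range(0, len(lines), chunk_size):
    chunk = ''.join(lines[i:i+chunk_size])
    # This is exactly the user message translate_to_german would send
    print(TRANSLATE_PROMPT.replace("<<INPUT>>", chunk))
```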
translate_ollama.py ADDED
@@ -0,0 +1,79 @@
+ """Original code for testing translation with Ollama. Results were not of the required quality."""
+ import ollama
+ import argparse
+
+ SYSTEM_PROMPT = """Translate the following password list to German. RESPECT the original casing even when it is grammatically incorrect. Don't add spaces or separators between words if they are not in the original. Respond only with the translated words one per line, nothing else.
+ Words:
+ password
+ iloveyou
+ princess
+ rockyou
+ abc123
+ nicole
+ loveyou
+
+ Translations:
+ passwort
+ ichliebedich
+ prinzessin
+ rockdich
+ abc123
+ nicole
+ liebedich
+ """
+
+ TRANSLATE_PROMPT = """Words:
+ <<INPUT>>
+
+ Translations:
+ """
+ def translate_to_german(text):
+     """Translate English text to German using the Ollama model."""
+     response = ollama.chat(
+         model='llama3',
+         messages=[
+             {"role": "system", "content": SYSTEM_PROMPT},
+             {
+                 'role': 'user',
+                 'content': TRANSLATE_PROMPT.replace("<<INPUT>>", text)
+             },
+         ]
+     )
+     return response['message']['content'] if 'message' in response and 'content' in response['message'] else ''
+
+ def process_file(input_file_path, output_file_path):
+     """Process the file in chunks and translate each chunk."""
+     try:
+         with open(input_file_path, 'r', encoding='latin1') as file:
+             lines = file.readlines()
+
+         translated_lines = []
+         chunk_size = 10
+
+         # Process the file in chunks of 10 lines
+         for i in range(0, len(lines), chunk_size):
+             chunk = ''.join(lines[i:i+chunk_size])
+             print("SENT", chunk)
+             translated_chunk = translate_to_german(chunk)
+             print("GOT", translated_chunk)
+             translated_lines.append(translated_chunk)
+
+         # Write the translated text to another file
+         with open(output_file_path, 'w', encoding='utf-8') as file:
+             file.writelines(translated_lines)
+
+     except FileNotFoundError:
+         print("The input file was not found.")
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Translate text file content to German.")
+     parser.add_argument("-i", "--input_file", required=True, help="Path to the input text file")
+     parser.add_argument("-o", "--output_file", required=True, help="Path to the output text file where translated text will be saved")
+
+     args = parser.parse_args()
+
+     process_file(args.input_file, args.output_file)
+
+ if __name__ == "__main__":
+     main()