xianchaowu committed
Commit
39a4b83
1 Parent(s): e3763fe

lazy lora for llama2-7bhf

Files changed (4)
  1. README.md +72 -0
  2. adapter_config.json +260 -0
  3. adapter_model.bin +3 -0
  4. usage.py +51 -0
README.md CHANGED
@@ -1,3 +1,75 @@
---
license: llama2
---

## Lazy LoRA

Lazy LoRA determines the rank of each LoRA layer from the singular values of the corresponding pretrained weight matrix.

It combines, in one model:
1. LoRA: [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685)
2. Prefix Tuning: [Prefix-Tuning: Optimizing Continuous Prompts for Generation](https://aclanthology.org/2021.acl-long.353/), [P-Tuning v2: Prompt Tuning Can Be Comparable to Fine-tuning Universally Across Scales and Tasks](https://arxiv.org/pdf/2110.07602.pdf)
3. Prompt Tuning: [The Power of Scale for Parameter-Efficient Prompt Tuning](https://arxiv.org/abs/2104.08691)
4. LLaMA-Adapter: [LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention](https://arxiv.org/abs/2303.16199)

This allows you to perform LoRA (additional low-rank adapters inserted into each linear layer) together with prompt learning (additional virtual tokens attached to the input and to the attention layers, acting as `past_key_values`).
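
The exact rank-selection rule lives in the lazy-lora fork of PEFT referenced in the Usage section. Purely as an illustration of the idea (this is not the actual implementation; the `energy` threshold and `max_rank` cap are made-up values), a per-matrix rank could be derived from its singular-value spectrum like this:

```python
# Rough sketch of SVD-based rank selection (illustrative only, not the lazy-lora code):
# give a weight matrix a rank proportional to how many singular values are needed
# to capture a fixed fraction of its spectral energy, capped by a base rank.
import torch

def rank_from_singular_values(weight: torch.Tensor,
                              max_rank: int = 64,
                              energy: float = 0.90) -> int:
    s = torch.linalg.svdvals(weight.float())    # singular values, descending
    cum = torch.cumsum(s, dim=0) / s.sum()      # normalized cumulative spectrum
    k = int((cum < energy).sum().item()) + 1    # values needed to reach `energy`
    return min(max_rank, k)

# toy example on a random 512x512 matrix (LLaMA-2-7B projections are 4096x4096)
print(rank_from_singular_values(torch.randn(512, 512)))
```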

## Usage

```python
import sys
sys.path.insert(1, '/workspace/asr/peft/src')
# TODO: set this path to the lazy-lora source code, or install it from source:
#   git clone git@github.com:Xianchao-Wu/peft.git
#   cd peft
#   python setup.py install

import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

cache_dir = "/workspace/asr/peft/qlora"
# TODO: set cache_dir to the path where you stored (or want to store) the llama2-7bhf model

lazylora_dir = os.getcwd()  # the path containing 'adapter_config.json' and 'adapter_model.bin'

config = PeftConfig.from_pretrained(lazylora_dir)

tokenizer = AutoTokenizer.from_pretrained(
    config.base_model_name_or_path,
    cache_dir=cache_dir,
    use_auth_token=True
)

# load the base model in 4-bit NF4 with double quantization (QLoRA-style)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    cache_dir=cache_dir,
    use_auth_token=True
)
print(sum(p.numel() for p in model.parameters()))
# 3,500,412,928 -> about half of 7B, since the 4-bit weights are stored packed (two values per byte)

# attach the lazy-LoRA adapter weights from this repository
model = PeftModel.from_pretrained(model, lazylora_dir)
print('after adding lazy lora parameters:')
model.print_trainable_parameters()
# trainable params: 0 || all params: 3,660,359,168 || trainable%: 0.0
```
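
Once the adapter is attached, the model behaves like any other `transformers` causal LM. The following minimal generation call is an illustrative sketch (the prompt and generation settings are placeholders, not part of the original card):

```python
# Minimal generation sketch with the model and tokenizer loaded above.
prompt = "Explain low-rank adaptation in one sentence."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64, do_sample=False)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```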

## MMLU result

Overall `mmlu_eval_accuracy` is 0.4436 (`mmlu_loss` 1.836, at epoch 1.36). Full per-subject breakdown:

{'mmlu_loss': 1.8361594152170253, 'mmlu_eval_accuracy_us_foreign_policy': 0.6363636363636364, 'mmlu_eval_accuracy_world_religions': 0.7368421052631579, 'mmlu_eval_accuracy_high_school_us_history': 0.6363636363636364, 'mmlu_eval_accuracy_high_school_psychology': 0.6166666666666667, 'mmlu_eval_accuracy_public_relations': 0.3333333333333333, 'mmlu_eval_accuracy_high_school_european_history': 0.6666666666666666, 'mmlu_eval_accuracy_econometrics': 0.16666666666666666, 'mmlu_eval_accuracy_high_school_microeconomics': 0.34615384615384615, 'mmlu_eval_accuracy_machine_learning': 0.18181818181818182, 'mmlu_eval_accuracy_high_school_mathematics': 0.3448275862068966, 'mmlu_eval_accuracy_high_school_computer_science': 0.5555555555555556, 'mmlu_eval_accuracy_professional_accounting': 0.3548387096774194, 'mmlu_eval_accuracy_high_school_world_history': 0.5, 'mmlu_eval_accuracy_marketing': 0.72, 'mmlu_eval_accuracy_sociology': 0.7272727272727273, 'mmlu_eval_accuracy_nutrition': 0.5454545454545454, 'mmlu_eval_accuracy_high_school_chemistry': 0.4090909090909091, 'mmlu_eval_accuracy_logical_fallacies': 0.5555555555555556, 'mmlu_eval_accuracy_college_mathematics': 0.18181818181818182, 'mmlu_eval_accuracy_computer_security': 0.2727272727272727, 'mmlu_eval_accuracy_miscellaneous': 0.6046511627906976, 'mmlu_eval_accuracy_high_school_statistics': 0.2608695652173913, 'mmlu_eval_accuracy_philosophy': 0.4117647058823529, 'mmlu_eval_accuracy_global_facts': 0.4, 'mmlu_eval_accuracy_management': 0.2727272727272727, 'mmlu_eval_accuracy_human_aging': 0.6956521739130435, 'mmlu_eval_accuracy_moral_scenarios': 0.25, 'mmlu_eval_accuracy_human_sexuality': 0.5, 'mmlu_eval_accuracy_abstract_algebra': 0.36363636363636365, 'mmlu_eval_accuracy_high_school_macroeconomics': 0.3488372093023256, 'mmlu_eval_accuracy_electrical_engineering': 0.375, 'mmlu_eval_accuracy_professional_medicine': 0.45161290322580644, 'mmlu_eval_accuracy_high_school_government_and_politics': 0.6666666666666666, 'mmlu_eval_accuracy_high_school_biology': 0.3125, 'mmlu_eval_accuracy_astronomy': 0.4375, 'mmlu_eval_accuracy_security_studies': 0.4074074074074074, 'mmlu_eval_accuracy_prehistory': 0.42857142857142855, 'mmlu_eval_accuracy_conceptual_physics': 0.3076923076923077, 'mmlu_eval_accuracy_college_medicine': 0.36363636363636365, 'mmlu_eval_accuracy_moral_disputes': 0.39473684210526316, 'mmlu_eval_accuracy_anatomy': 0.5, 'mmlu_eval_accuracy_clinical_knowledge': 0.41379310344827586, 'mmlu_eval_accuracy_college_computer_science': 0.5454545454545454, 'mmlu_eval_accuracy_high_school_geography': 0.5909090909090909, 'mmlu_eval_accuracy_college_chemistry': 0.125, 'mmlu_eval_accuracy_professional_psychology': 0.36231884057971014, 'mmlu_eval_accuracy_virology': 0.4444444444444444, 'mmlu_eval_accuracy_international_law': 0.8461538461538461, 'mmlu_eval_accuracy_medical_genetics': 0.8181818181818182, 'mmlu_eval_accuracy_formal_logic': 0.14285714285714285, 'mmlu_eval_accuracy_professional_law': 0.34705882352941175, 'mmlu_eval_accuracy_college_biology': 0.25, 'mmlu_eval_accuracy_jurisprudence': 0.45454545454545453, 'mmlu_eval_accuracy_business_ethics': 0.5454545454545454, 'mmlu_eval_accuracy_college_physics': 0.5454545454545454, 'mmlu_eval_accuracy_high_school_physics': 0.29411764705882354, 'mmlu_eval_accuracy_elementary_mathematics': 0.3170731707317073, 'mmlu_eval_accuracy': 0.4435841258637352, 'epoch': 1.36}
adapter_config.json ADDED
@@ -0,0 +1,260 @@
{
  "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lazy_lora_weights": true,
  "is_r_by_svd": true,
  "is_r_reuse": true,
  "lazy_lora_alpha": 16.0,
  "lazy_lora_dropout": 0.05,
  "lazy_pre_adapter_type": "none",
  "lazy_pre_lora_alpha": 0.1,
  "modules_to_save": null,
  "num_attention_heads": 32,
  "num_layers": 32,
  "num_transformer_submodules": 1,
  "num_virtual_tokens": null,
  "peft_type": "LAZY_LORA",
  "prefix_tuning_config": null,
  "prompt_tuning_config": null,
  "r": 64,
  "r_by_module_dict": {
    "model.layers.0.mlp.down_proj": 58,
    "model.layers.0.mlp.gate_proj": 50,
    "model.layers.0.mlp.up_proj": 52,
    "model.layers.0.self_attn.k_proj": 17,
    "model.layers.0.self_attn.o_proj": 24,
    "model.layers.0.self_attn.q_proj": 15,
    "model.layers.0.self_attn.v_proj": 36,
    "model.layers.1.mlp.down_proj": 62,
    "model.layers.1.mlp.gate_proj": 55,
    "model.layers.1.mlp.up_proj": 58,
    "model.layers.1.self_attn.k_proj": 46,
    "model.layers.1.self_attn.o_proj": 30,
    "model.layers.1.self_attn.q_proj": 48,
    "model.layers.1.self_attn.v_proj": 36,
    "model.layers.10.mlp.down_proj": 62,
    "model.layers.10.mlp.gate_proj": 63,
    "model.layers.10.mlp.up_proj": 63,
    "model.layers.10.self_attn.k_proj": 69,
    "model.layers.10.self_attn.o_proj": 55,
    "model.layers.10.self_attn.q_proj": 68,
    "model.layers.10.self_attn.v_proj": 54,
    "model.layers.11.mlp.down_proj": 63,
    "model.layers.11.mlp.gate_proj": 62,
    "model.layers.11.mlp.up_proj": 64,
    "model.layers.11.self_attn.k_proj": 63,
    "model.layers.11.self_attn.o_proj": 56,
    "model.layers.11.self_attn.q_proj": 63,
    "model.layers.11.self_attn.v_proj": 56,
    "model.layers.12.mlp.down_proj": 63,
    "model.layers.12.mlp.gate_proj": 62,
    "model.layers.12.mlp.up_proj": 64,
    "model.layers.12.self_attn.k_proj": 68,
    "model.layers.12.self_attn.o_proj": 57,
    "model.layers.12.self_attn.q_proj": 67,
    "model.layers.12.self_attn.v_proj": 56,
    "model.layers.13.mlp.down_proj": 64,
    "model.layers.13.mlp.gate_proj": 62,
    "model.layers.13.mlp.up_proj": 65,
    "model.layers.13.self_attn.k_proj": 68,
    "model.layers.13.self_attn.o_proj": 59,
    "model.layers.13.self_attn.q_proj": 67,
    "model.layers.13.self_attn.v_proj": 60,
    "model.layers.14.mlp.down_proj": 64,
    "model.layers.14.mlp.gate_proj": 62,
    "model.layers.14.mlp.up_proj": 65,
    "model.layers.14.self_attn.k_proj": 65,
    "model.layers.14.self_attn.o_proj": 58,
    "model.layers.14.self_attn.q_proj": 65,
    "model.layers.14.self_attn.v_proj": 58,
    "model.layers.15.mlp.down_proj": 65,
    "model.layers.15.mlp.gate_proj": 63,
    "model.layers.15.mlp.up_proj": 65,
    "model.layers.15.self_attn.k_proj": 67,
    "model.layers.15.self_attn.o_proj": 61,
    "model.layers.15.self_attn.q_proj": 66,
    "model.layers.15.self_attn.v_proj": 61,
    "model.layers.16.mlp.down_proj": 65,
    "model.layers.16.mlp.gate_proj": 63,
    "model.layers.16.mlp.up_proj": 65,
    "model.layers.16.self_attn.k_proj": 66,
    "model.layers.16.self_attn.o_proj": 65,
    "model.layers.16.self_attn.q_proj": 65,
    "model.layers.16.self_attn.v_proj": 65,
    "model.layers.17.mlp.down_proj": 65,
    "model.layers.17.mlp.gate_proj": 64,
    "model.layers.17.mlp.up_proj": 65,
    "model.layers.17.self_attn.k_proj": 67,
    "model.layers.17.self_attn.o_proj": 65,
    "model.layers.17.self_attn.q_proj": 67,
    "model.layers.17.self_attn.v_proj": 65,
    "model.layers.18.mlp.down_proj": 65,
    "model.layers.18.mlp.gate_proj": 64,
    "model.layers.18.mlp.up_proj": 65,
    "model.layers.18.self_attn.k_proj": 67,
    "model.layers.18.self_attn.o_proj": 69,
    "model.layers.18.self_attn.q_proj": 67,
    "model.layers.18.self_attn.v_proj": 68,
    "model.layers.19.mlp.down_proj": 65,
    "model.layers.19.mlp.gate_proj": 65,
    "model.layers.19.mlp.up_proj": 65,
    "model.layers.19.self_attn.k_proj": 64,
    "model.layers.19.self_attn.o_proj": 69,
    "model.layers.19.self_attn.q_proj": 65,
    "model.layers.19.self_attn.v_proj": 68,
    "model.layers.2.mlp.down_proj": 63,
    "model.layers.2.mlp.gate_proj": 60,
    "model.layers.2.mlp.up_proj": 60,
    "model.layers.2.self_attn.k_proj": 65,
    "model.layers.2.self_attn.o_proj": 56,
    "model.layers.2.self_attn.q_proj": 64,
    "model.layers.2.self_attn.v_proj": 55,
    "model.layers.20.mlp.down_proj": 66,
    "model.layers.20.mlp.gate_proj": 65,
    "model.layers.20.mlp.up_proj": 65,
    "model.layers.20.self_attn.k_proj": 65,
    "model.layers.20.self_attn.o_proj": 71,
    "model.layers.20.self_attn.q_proj": 65,
    "model.layers.20.self_attn.v_proj": 70,
    "model.layers.21.mlp.down_proj": 66,
    "model.layers.21.mlp.gate_proj": 66,
    "model.layers.21.mlp.up_proj": 65,
    "model.layers.21.self_attn.k_proj": 64,
    "model.layers.21.self_attn.o_proj": 73,
    "model.layers.21.self_attn.q_proj": 64,
    "model.layers.21.self_attn.v_proj": 71,
    "model.layers.22.mlp.down_proj": 66,
    "model.layers.22.mlp.gate_proj": 66,
    "model.layers.22.mlp.up_proj": 65,
    "model.layers.22.self_attn.k_proj": 66,
    "model.layers.22.self_attn.o_proj": 73,
    "model.layers.22.self_attn.q_proj": 66,
    "model.layers.22.self_attn.v_proj": 72,
    "model.layers.23.mlp.down_proj": 66,
    "model.layers.23.mlp.gate_proj": 66,
    "model.layers.23.mlp.up_proj": 65,
    "model.layers.23.self_attn.k_proj": 67,
    "model.layers.23.self_attn.o_proj": 77,
    "model.layers.23.self_attn.q_proj": 68,
    "model.layers.23.self_attn.v_proj": 76,
    "model.layers.24.mlp.down_proj": 66,
    "model.layers.24.mlp.gate_proj": 67,
    "model.layers.24.mlp.up_proj": 66,
    "model.layers.24.self_attn.k_proj": 62,
    "model.layers.24.self_attn.o_proj": 76,
    "model.layers.24.self_attn.q_proj": 63,
    "model.layers.24.self_attn.v_proj": 75,
    "model.layers.25.mlp.down_proj": 66,
    "model.layers.25.mlp.gate_proj": 67,
    "model.layers.25.mlp.up_proj": 66,
    "model.layers.25.self_attn.k_proj": 65,
    "model.layers.25.self_attn.o_proj": 79,
    "model.layers.25.self_attn.q_proj": 66,
    "model.layers.25.self_attn.v_proj": 78,
    "model.layers.26.mlp.down_proj": 66,
    "model.layers.26.mlp.gate_proj": 67,
    "model.layers.26.mlp.up_proj": 66,
    "model.layers.26.self_attn.k_proj": 63,
    "model.layers.26.self_attn.o_proj": 80,
    "model.layers.26.self_attn.q_proj": 63,
    "model.layers.26.self_attn.v_proj": 79,
    "model.layers.27.mlp.down_proj": 66,
    "model.layers.27.mlp.gate_proj": 67,
    "model.layers.27.mlp.up_proj": 67,
    "model.layers.27.self_attn.k_proj": 68,
    "model.layers.27.self_attn.o_proj": 81,
    "model.layers.27.self_attn.q_proj": 68,
    "model.layers.27.self_attn.v_proj": 80,
    "model.layers.28.mlp.down_proj": 67,
    "model.layers.28.mlp.gate_proj": 67,
    "model.layers.28.mlp.up_proj": 67,
    "model.layers.28.self_attn.k_proj": 65,
    "model.layers.28.self_attn.o_proj": 83,
    "model.layers.28.self_attn.q_proj": 66,
    "model.layers.28.self_attn.v_proj": 82,
    "model.layers.29.mlp.down_proj": 68,
    "model.layers.29.mlp.gate_proj": 67,
    "model.layers.29.mlp.up_proj": 68,
    "model.layers.29.self_attn.k_proj": 62,
    "model.layers.29.self_attn.o_proj": 84,
    "model.layers.29.self_attn.q_proj": 62,
    "model.layers.29.self_attn.v_proj": 82,
    "model.layers.3.mlp.down_proj": 62,
    "model.layers.3.mlp.gate_proj": 63,
    "model.layers.3.mlp.up_proj": 62,
    "model.layers.3.self_attn.k_proj": 70,
    "model.layers.3.self_attn.o_proj": 53,
    "model.layers.3.self_attn.q_proj": 68,
    "model.layers.3.self_attn.v_proj": 53,
    "model.layers.30.mlp.down_proj": 67,
    "model.layers.30.mlp.gate_proj": 68,
    "model.layers.30.mlp.up_proj": 68,
    "model.layers.30.self_attn.k_proj": 64,
    "model.layers.30.self_attn.o_proj": 87,
    "model.layers.30.self_attn.q_proj": 64,
    "model.layers.30.self_attn.v_proj": 85,
    "model.layers.31.mlp.down_proj": 67,
    "model.layers.31.mlp.gate_proj": 71,
    "model.layers.31.mlp.up_proj": 70,
    "model.layers.31.self_attn.k_proj": 63,
    "model.layers.31.self_attn.o_proj": 78,
    "model.layers.31.self_attn.q_proj": 61,
    "model.layers.31.self_attn.v_proj": 77,
    "model.layers.4.mlp.down_proj": 61,
    "model.layers.4.mlp.gate_proj": 64,
    "model.layers.4.mlp.up_proj": 62,
    "model.layers.4.self_attn.k_proj": 71,
    "model.layers.4.self_attn.o_proj": 56,
    "model.layers.4.self_attn.q_proj": 70,
    "model.layers.4.self_attn.v_proj": 56,
    "model.layers.5.mlp.down_proj": 61,
    "model.layers.5.mlp.gate_proj": 64,
    "model.layers.5.mlp.up_proj": 62,
    "model.layers.5.self_attn.k_proj": 73,
    "model.layers.5.self_attn.o_proj": 57,
    "model.layers.5.self_attn.q_proj": 72,
    "model.layers.5.self_attn.v_proj": 58,
    "model.layers.6.mlp.down_proj": 61,
    "model.layers.6.mlp.gate_proj": 65,
    "model.layers.6.mlp.up_proj": 62,
    "model.layers.6.self_attn.k_proj": 67,
    "model.layers.6.self_attn.o_proj": 53,
    "model.layers.6.self_attn.q_proj": 67,
    "model.layers.6.self_attn.v_proj": 53,
    "model.layers.7.mlp.down_proj": 61,
    "model.layers.7.mlp.gate_proj": 65,
    "model.layers.7.mlp.up_proj": 62,
    "model.layers.7.self_attn.k_proj": 66,
    "model.layers.7.self_attn.o_proj": 53,
    "model.layers.7.self_attn.q_proj": 67,
    "model.layers.7.self_attn.v_proj": 53,
    "model.layers.8.mlp.down_proj": 61,
    "model.layers.8.mlp.gate_proj": 64,
    "model.layers.8.mlp.up_proj": 62,
    "model.layers.8.self_attn.k_proj": 68,
    "model.layers.8.self_attn.o_proj": 55,
    "model.layers.8.self_attn.q_proj": 69,
    "model.layers.8.self_attn.v_proj": 54,
    "model.layers.9.mlp.down_proj": 62,
    "model.layers.9.mlp.gate_proj": 63,
    "model.layers.9.mlp.up_proj": 63,
    "model.layers.9.self_attn.k_proj": 70,
    "model.layers.9.self_attn.o_proj": 56,
    "model.layers.9.self_attn.q_proj": 70,
    "model.layers.9.self_attn.v_proj": 55
  },
  "rank_file": "",
  "target_modules": [
    "down_proj",
    "v_proj",
    "q_proj",
    "o_proj",
    "up_proj",
    "gate_proj",
    "k_proj"
  ],
  "task_type": "CAUSAL_LM",
  "token_dim": 4096
}
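
Since `is_r_by_svd` is true, `r_by_module_dict` records the SVD-chosen rank for each adapted module (7 projections x 32 layers = 224 modules), varying around the base `r` of 64. A small, illustrative script to inspect the spread of these ranks (it assumes the file is read from this repository's working directory):

```python
# Illustrative only: summarize the SVD-chosen ranks stored in adapter_config.json.
import json
from statistics import mean

with open("adapter_config.json") as f:
    cfg = json.load(f)

ranks = cfg["r_by_module_dict"]
print(f"{len(ranks)} adapted modules, base r = {cfg['r']}")
print(f"rank range: {min(ranks.values())}..{max(ranks.values())}, mean = {mean(ranks.values()):.1f}")

# group by projection type to see which modules received the largest ranks
by_type = {}
for name, r in ranks.items():
    proj = name.rsplit(".", 1)[-1]          # 'q_proj', 'k_proj', 'down_proj', ...
    by_type.setdefault(proj, []).append(r)
for proj, rs in sorted(by_type.items()):
    print(f"{proj:>10}: mean rank = {mean(rs):.1f}")
```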
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3aa846c06af3188a3d7b7c3e0a32ad4c1ecb48b1d02353610425c192c3ae4182
size 320063949
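
The file above is a Git LFS pointer; the actual weight file (about 320 MB) is fetched on checkout or download. If desired, the download can be checked against the recorded `oid`, roughly as in this illustrative sketch:

```python
# Illustrative sketch: verify adapter_model.bin against the LFS pointer's sha256 oid.
import hashlib

expected = "3aa846c06af3188a3d7b7c3e0a32ad4c1ecb48b1d02353610425c192c3ae4182"

h = hashlib.sha256()
with open("adapter_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        h.update(chunk)

assert h.hexdigest() == expected, "checksum mismatch: incomplete or modified download"
print("adapter_model.bin matches the LFS pointer oid")
```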
usage.py ADDED
@@ -0,0 +1,51 @@
import sys
sys.path.insert(1, '/workspace/asr/peft/src')
# TODO: set this path to the lazy-lora source code, or install it from source:
#   git clone git@github.com:Xianchao-Wu/peft.git
#   cd peft
#   python setup.py install

import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

cache_dir = "/workspace/asr/peft/qlora"
# TODO: set cache_dir to the path where you stored (or want to store) the llama2-7bhf model

lazylora_dir = os.getcwd()  # the path containing 'adapter_config.json' and 'adapter_model.bin'

config = PeftConfig.from_pretrained(lazylora_dir)

tokenizer = AutoTokenizer.from_pretrained(
    config.base_model_name_or_path,
    cache_dir=cache_dir,
    use_auth_token=True
)

# load the base model in 4-bit NF4 with double quantization (QLoRA-style)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    cache_dir=cache_dir,
    use_auth_token=True
)
print(sum(p.numel() for p in model.parameters()))
# 3,500,412,928 -> about half of 7B, since the 4-bit weights are stored packed (two values per byte)

# attach the lazy-LoRA adapter weights from this repository
model = PeftModel.from_pretrained(model, lazylora_dir)
print('after adding lazy lora parameters:')
model.print_trainable_parameters()
# trainable params: 0 || all params: 3,660,359,168 || trainable%: 0.0