OferB committed on
Commit
df3919e
1 Parent(s): ff0df0b

Add throughput benchmark example

Files changed (1)
  1. hf_benchmark_example.py +212 -0
hf_benchmark_example.py ADDED
@@ -0,0 +1,212 @@
+ """
+ Command-line examples
+ You need a file called "sample.txt" (the default path) whose text is used to build the prompt tokens,
+ or supply --text_file "path/to/text.txt" to point to another text file.
+ You can use our attached "sample.txt" file, which contains one of Deci's blog posts, as a prompt.
+ # Run this and record tokens per second (652 tokens per second on an A10 for DeciLM-6b)
+ python hf_benchmark_example.py --model Deci/DeciLM-6b-instruct
+ # Run this and record tokens per second (136 tokens per second on an A10 for meta-llama/Llama-2-7b-hf); CUDA OOM above batch size 8
+ python hf_benchmark_example.py --model meta-llama/Llama-2-7b-hf --batch_size 8
+ """
+ 
+ import json
+ from argparse import ArgumentParser
+ 
+ import datasets
+ import torch
+ import transformers
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ 
+ 
+ def parse_args():
+     parser = ArgumentParser()
+ 
+     parser.add_argument(
+         "--model",
+         required=True,
+         help="Model to evaluate; provide a repo name from the Hugging Face Hub or a local path",
+     )
+     parser.add_argument(
+         "--temperature",
+         default=0.2,
+         type=float,
+     )
+     parser.add_argument(
+         "--top_p",
+         default=0.95,
+         type=float,
+     )
+     parser.add_argument(
+         "--top_k",
+         default=0,
+         type=int,
+     )
+ 
+     parser.add_argument(
+         "--revision",
+         default=None,
+         help="Model revision to use",
+     )
+     parser.add_argument(
+         "--iterations",
+         type=int,
+         default=6,
+         help="Number of timed iterations for each measurement",
+     )
+     parser.add_argument(
+         "--batch_size",
+         type=int,
+         default=64,
+         help="Batch size used for generation",
+     )
+     parser.add_argument(
+         "--prompt_length",
+         type=int,
+         default=512,
+         help="Number of prompt tokens taken from the text file",
+     )
+     parser.add_argument(
+         "--max_new_tokens",
+         type=int,
+         default=512,
+         help="Maximum number of newly generated tokens (not counting the prompt)",
+     )
+     parser.add_argument(
+         "--precision",
+         type=str,
+         default="bf16",
+         help="Model precision, one of: fp32, fp16 or bf16",
+     )
+     parser.add_argument(
+         "--text_file",
+         type=str,
+         default="sample.txt",
+         help="Text file that will be used to generate tokens for prompts",
+     )
+     parser.add_argument(
+         "--load_in_8bit",
+         action="store_true",
+         help="Load model in 8-bit",
+     )
+     parser.add_argument(
+         "--load_in_4bit",
+         action="store_true",
+         help="Load model in 4-bit",
+     )
+     return parser.parse_args()
+ 
+ 
+ def main():
+     args = parse_args()
+     transformers.logging.set_verbosity_error()
+     datasets.logging.set_verbosity_error()
+ 
+     dict_precisions = {
+         "fp32": torch.float32,
+         "fp16": torch.float16,
+         "bf16": torch.bfloat16,
+     }
+     if args.precision not in dict_precisions:
+         raise ValueError(
+             f"Invalid precision {args.precision}, choose from: fp16, fp32, bf16"
+         )
+ 
+     if args.load_in_8bit:
+         print("Loading model in 8bit")
+         # the model needs to fit on one GPU
+         model = AutoModelForCausalLM.from_pretrained(
+             args.model,
+             revision=args.revision,
+             load_in_8bit=args.load_in_8bit,
+             trust_remote_code=True,
+             use_auth_token=True,
+             device_map={"": "cuda"},
+         )
+     elif args.load_in_4bit:
+         print("Loading model in 4bit")
+         # the model needs to fit on one GPU
+         model = AutoModelForCausalLM.from_pretrained(
+             args.model,
+             revision=args.revision,
+             load_in_4bit=args.load_in_4bit,
+             trust_remote_code=True,
+             use_auth_token=True,
+             device_map={"": "cuda"},
+         )
+     else:
+         print(f"Loading model in {args.precision}")
+         model = AutoModelForCausalLM.from_pretrained(
+             args.model,
+             revision=args.revision,
+             torch_dtype=dict_precisions[args.precision],
+             trust_remote_code=True,
+             use_auth_token=True,
+         )
+ 
+     tokenizer = AutoTokenizer.from_pretrained(
+         args.model,
+         revision=args.revision,
+         trust_remote_code=True,
+         use_auth_token=True,
+     )
+ 
+     # CUDA events for GPU-side timing
+     starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+     if not (args.load_in_8bit or args.load_in_4bit):
+         # quantized models are already placed on the GPU by device_map
+         model.cuda()
+     model.eval()
+ 
+     with open(args.text_file, "r") as f:
+         prompt = f.read()
+ 
+     # take the first prompt_length tokens of the text file as the prompt
+     prompt = torch.tensor(tokenizer.encode(prompt))[:args.prompt_length].cuda()
+ 
+     results = {
+         'prefill': [],
+         'gen': [],
+         'max_new_tokens': args.max_new_tokens,
+         'prompt_length': args.prompt_length,
+         'model': args.model,
+         'batch_size': args.batch_size,
+     }
+     inputs = prompt.repeat(args.batch_size, 1)
+ 
+     # warmup
+     print('start warmup')
+     for _ in range(10):
+         with torch.no_grad():
+             _ = model.generate(
+                 input_ids=inputs,
+                 max_new_tokens=1,
+                 do_sample=False,
+             )
+     print('finish warmup')
+     torch.cuda.synchronize()
+ 
+     # time prefill only (generate a single token)
+     for prefill_iter in range(args.iterations):
+         starter.record()
+         with torch.no_grad():
+             _ = model.generate(
+                 input_ids=inputs,
+                 max_new_tokens=1,
+                 do_sample=False,
+             )
+         ender.record()
+         torch.cuda.synchronize()
+         t = starter.elapsed_time(ender) / 1000
+         results['prefill'].append(t)
+         print(f'{args.batch_size} prefill iter {prefill_iter} took: {t}')
+ 
+     # time full generation (prefill + max_new_tokens decode steps)
+     for gen_iter in range(args.iterations):
+         starter.record()
+         with torch.no_grad():
+             _ = model.generate(
+                 input_ids=inputs,
+                 max_new_tokens=args.max_new_tokens,
+                 do_sample=False,
+             )
+         ender.record()
+         torch.cuda.synchronize()
+         t = starter.elapsed_time(ender) / 1000
+         results['gen'].append(t)
+ 
+         print(f'{args.batch_size} total generation iter {gen_iter} took: {t}')
+         print(f'{args.batch_size * args.max_new_tokens / t} tokens per second')
+ 
+     model_str = args.model.split('/')[-1]
+     with open(f'timing_{model_str}_{args.batch_size}.json', 'w') as f:
+         json.dump(results, f)
+ 
+ 
+ if __name__ == "__main__":
+     main()
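
For reference, here is a minimal post-processing sketch for the timing JSON the script writes. The keys match the results dict above; the example file name and the decode-only estimate (which subtracts the average prefill time from the average generation time) are illustrative assumptions, not part of the script.

    import json
    import statistics

    # Hypothetical output file name; the script writes timing_<model>_<batch_size>.json
    with open("timing_DeciLM-6b-instruct_64.json") as f:
        r = json.load(f)

    prefill = statistics.mean(r["prefill"])   # avg seconds for a 1-new-token generation (prefill-dominated)
    gen = statistics.mean(r["gen"])           # avg seconds for a full max_new_tokens generation
    tokens = r["batch_size"] * r["max_new_tokens"]

    end_to_end_tps = tokens / gen             # same figure the script prints per iteration
    decode_tps = tokens / (gen - prefill)     # illustrative decode-only estimate

    print(f"prefill: {prefill:.3f}s  end-to-end: {end_to_end_tps:.1f} tok/s  decode-only: {decode_tps:.1f} tok/s")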