(ZeroGPU) Avoid re-loading model when possible

#1
by cbensimon HF staff - opened
Files changed (1)
  1. app.py +16 -7
app.py CHANGED
@@ -24,6 +24,9 @@ hf_hub_download(
24
  )
25
 
26
 
 
 
 
27
  @spaces.GPU(duration=120)
28
  def respond(
29
      message,
@@ -38,13 +41,19 @@ def respond(
38
  ):
39
      chat_template = MessagesFormatterType.GEMMA_2
40
 
41
-     llm = Llama(
42
-         model_path=f"models/{model}",
43
-         flash_attn=True,
44
-         n_gpu_layers=81,
45
-         n_batch=1024,
46
-         n_ctx=8192,
47
-     )
 
 
 
 
 
 
48
      provider = LlamaCppPythonProvider(llm)
49
 
50
      agent = LlamaCppAgent(
 
24
  )
25
 
26
 
27
+ llm = None
28
+ llm_model = None
29
+
30
  @spaces.GPU(duration=120)
31
  def respond(
32
      message,
 
41
  ):
42
      chat_template = MessagesFormatterType.GEMMA_2
43
 
44
+     global llm
45
+     global llm_model
46
+
47
+     if llm is None or llm_model != model:
48
+         llm = Llama(
49
+             model_path=f"models/{model}",
50
+             flash_attn=True,
51
+             n_gpu_layers=81,
52
+             n_batch=1024,
53
+             n_ctx=8192,
54
+         )
55
+         llm_model = model
56
+
57
      provider = LlamaCppPythonProvider(llm)
58
 
59
      agent = LlamaCppAgent(