From 1f27a9abeddde91058ae0a866bcb928e38de23dc Mon Sep 17 00:00:00 2001 From: zshobbs Date: Wed, 25 Jan 2023 22:56:04 +0000 Subject: [PATCH 1/2] add HF accelerate for models --- README.md | 42 ++++++++++++++++++++++++++++++++ app.py | 72 +++++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 94 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 7431553..72f41a8 100644 --- a/README.md +++ b/README.md @@ -4,3 +4,45 @@ ChatGPT @ Home: Large Language Model (LLM) chatbot application, written by ChatG I asked ChatGPT to build an LLM-based chatbot app and this was the result. + +### Running float-16 and 8 bit across multiple GPU's +Hugging face accelerate lets you run LLM's over multiple models, +For half precission across multiple gpus accelerate is only needed. + +Install hugging face [accelerate](https://huggingface.co/docs/accelerate/v0.3.0/installation.html) + +For 8 bit bitsandbytes is needed (needs Turing or Ampere GPUs.) + +Install [bitsandbytes](https://pypi.org/project/bitsandbytes/) and see docs + +In app.py update the memory_map max memory parameter to use when loading the model across gpus +this is only for the model, input_text parameters will add to this along with other pipeline variables +change to meet your gpu memory needs etc below works for 2 nvidia rtx 3090's at 8bit with "facebook/opt-30b" +``` +memory_map = {0: "15GiB", 1: "20GiB", "cpu": "40GiB"} +``` +For 8 bit to work the model needs to be able to fit on gpus vram, accelerate wont use the +cpu as well in 8 bit mode. 
You can load torch.float16 with torch_dtype = torch.float16, +set load_in_8bit=False, you can even use a ssd cache with offload_folder = "./path/to/ssd" +for parameter offloading + +#### 8 bit +``` +generator = AutoModelForCausalLM.from_pretrained( +    MODEL_NAME, +    device_map="auto", +    max_memory=memory_map, +    load_in_8bit=True, +) +``` + +#### float16 +``` +generator = AutoModelForCausalLM.from_pretrained( +    MODEL_NAME, +    device_map="auto", +    torch_dtype = torch.float16, +    max_memory=memory_map, +    load_in_8bit=False, +    offload_folder = "./path/to/ssd") +``` diff --git a/app.py b/app.py index 87f6d79..5d48e1d 100644 --- a/app.py +++ b/app.py @@ -1,19 +1,40 @@ -import transformers -from transformers import utils, pipeline, set_seed import torch -from flask import Flask, request, render_template, session, redirect - +import transformers +from flask import Flask, redirect, render_template, request, session +from transformers import (AutoModelForCausalLM, AutoTokenizer, pipeline, + set_seed, utils) app = Flask(__name__) # Set the secret key for the session -app.secret_key = 'your-secret-key' +app.secret_key = "your-secret-key" -MODEL_NAME = "facebook/opt-125m" +MODEL_NAME = "facebook/opt-30b" +MAX_NEW_TOKENS = 50 # Initialize the chat history -history = ["Human: Can you tell me the weather forecast for tomorrow?\nBot: Try checking a weather app like a normal person.\nHuman: Can you help me find a good restaurant in the area\nBot: Try asking someone with a functioning sense of taste.\n"] -generator = pipeline('text-generation', model=f"{MODEL_NAME}", do_sample=True, torch_dtype=torch.half) +history = [ + "Human: Can you tell me the weather forecast for tomorrow?\nBot: Try checking a weather app like a normal person.\nHuman: Can you help me find a good restaurant in the area\nBot: Try asking someone with a functioning sense of taste.\n" +] + +# max memory to use when loading the model across gpus +# this is only for the model input_text will add to this +# along with model 
internal variables +# change to meet your gpu memory etc +# this works for 2 nvidia rtx 3090's at 8bit +memory_map = {0: "15GiB", 1: "20GiB", "cpu": "40GiB"} + +# for 8 bit to work the model needs to be able to fit on gpus +# this won't use the cpu as well you can load torch.float16 +# with torch_dtype = torch.float16, set load_in_8bit=False, +# you can even use a ssd cache with offload_folder = "./path/to/ssd" +generator = AutoModelForCausalLM.from_pretrained( + MODEL_NAME, + device_map="auto", + max_memory=memory_map, + load_in_8bit=True, +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left") # Define the chatbot logic @@ -21,7 +42,14 @@ def chatbot_response(input_text, history): # Concatenate the input text and history list input_text = "\n".join(history) + "\nHuman: " + input_text + " Bot: " set_seed(32) - response_text = generator(input_text, max_length=1024, num_beams=1, num_return_sequences=1)[0]['generated_text'] + + # tokenize the input text and accelerate needs the data sent to 0 + input_text = tokenizer.encode(input_text, return_tensors="pt").to(0) + # get raw response + response_text = generator.generate(input_text, max_new_tokens=MAX_NEW_TOKENS) + # decode the response + response_text = tokenizer.decode(response_text[0].tolist()) + # Extract the bot's response from the generated text response_text = response_text.split("Bot:")[-1] # Cut off any "Human:" or "human:" parts from the response @@ -30,29 +58,33 @@ return response_text -@app.route('/', methods=['GET', 'POST']) +@app.route("/", methods=["GET", "POST"]) def index(): global history # Make the history variable global - if request.method == 'POST': - input_text = request.form['input_text'] + if request.method == "POST": + input_text = request.form["input_text"] response_text = chatbot_response(input_text, history) # Append the input and response to the chat history history.append(f"Human: {input_text}") history.append(f"Bot: 
{response_text}") else: - input_text = '' - response_text = '' + input_text = "" + response_text = "" # Render the template with the updated chat history - return render_template('index.html', input_text=input_text, response_text=response_text, history=history) + return render_template( + "index.html", input_text=input_text, response_text=response_text, history=history + ) -@app.route('/reset', methods=['POST']) +@app.route("/reset", methods=["POST"]) def reset(): global history # Make the history variable global - history = ["Bot: Hello, how can I help you today? I am a chatbot designed to assist with a variety of tasks and answer questions. You can ask me about anything from general knowledge to specific topics, and I will do my best to provide a helpful and accurate response. Please go ahead and ask me your first question.\n"] + history = [ + "Bot: Hello, how can I help you today? I am a chatbot designed to assist with a variety of tasks and answer questions. You can ask me about anything from general knowledge to specific topics, and I will do my best to provide a helpful and accurate response. Please go ahead and ask me your first question.\n" + ] # Redirect to the chat page - return redirect('/') + return redirect("/") -if __name__ == '__main__': - app.run(host='0.0.0.0', port=5001) +if __name__ == "__main__": + app.run(host="0.0.0.0", port=5001) From dbaf544f7f9287f662dbd814b92cae683db83214 Mon Sep 17 00:00:00 2001 From: zshobbs <32156712+zshobbs@users.noreply.github.com> Date: Wed, 25 Jan 2023 22:58:01 +0000 Subject: [PATCH 2/2] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 72f41a8..109aefe 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ I asked ChatGPT to build an LLM-based chatbot app and this was the result. 
### Running float-16 and 8 bit across multiple GPU's -Hugging face accelerate lets you run LLM's over multiple models, +Hugging face accelerate lets you run LLM's over multiple GPU's, For half precission across multiple gpus accelerate is only needed. Install hugging face [accelerate](https://huggingface.co/docs/accelerate/v0.3.0/installation.html)