From 1f27a9abeddde91058ae0a866bcb928e38de23dc Mon Sep 17 00:00:00 2001 From: zshobbs Date: Wed, 25 Jan 2023 22:56:04 +0000 Subject: [PATCH 1/2] add HF accelerate for models --- README.md | 42 ++++++++++++++++++++++++++++++++ app.py | 72 +++++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 94 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 7431553..72f41a8 100644 --- a/README.md +++ b/README.md @@ -4,3 +4,45 @@ ChatGPT @ Home: Large Language Model (LLM) chatbot application, written by ChatG I asked ChatGPT to build an LLM-based chatbot app and this was the result. + +### Running float-16 and 8 bit across multiple GPU's +Hugging face accelerate lets you run LLM's over multiple models, +For half precission across multiple gpus accelerate is only needed. + +Install hugging face [accelerate](https://huggingface.co/docs/accelerate/v0.3.0/installation.html) + +For 8 bit bitsandbytes is needed (needs Turing or Ampere GPUs.) + +Install [bitsandbytes](https://pypi.org/project/bitsandbytes/) and see docs + +In app.py update the memory_map max memory parameter to use when loading the model across gpus +this is only for the model, input_text parameters will add to this along with other pipeline variables +change to meet your gpu memory needs etc below works for 2 nvidia rtx 3090's at 8bit with "facebook/opt-30b" +``` +memory_map = {0: "15GiB", 1: "20GiB", "cpu": "40GiB"} +``` +For 8 bit to work the model needs to be able to fit on gpus vram, accelerate wont use the +cpu as well in 8 bit mode. 
You can load torch.float16 with torch_dtype = torch.float16, +set load_in_8bit=False, you can even use a ssd cache with offload_folder = "./path/to/ssd" +for parameter offloading + +#### 8 bit +``` +generator = AutoModelForCausalLM.from_pretrained( +    MODEL_NAME, +    device_map="auto", +    max_memory=memory_map, +    load_in_8bit=True, +) +``` + +#### float16 +``` +generator = AutoModelForCausalLM.from_pretrained( +    MODEL_NAME, +    device_map="auto", +    torch_dtype = torch.float16, +    max_memory=memory_map, +    load_in_8bit=False, +    offload_folder = "./path/to/ssd") +``` diff --git a/app.py b/app.py index 87f6d79..5d48e1d 100644 --- a/app.py +++ b/app.py @@ -1,19 +1,40 @@ -import transformers -from transformers import utils, pipeline, set_seed import torch -from flask import Flask, request, render_template, session, redirect - +import transformers +from flask import Flask, redirect, render_template, request, session +from transformers import (AutoModelForCausalLM, AutoTokenizer, pipeline, + set_seed, utils) app = Flask(__name__) # Set the secret key for the session -app.secret_key = 'your-secret-key' +app.secret_key = "your-secret-key" -MODEL_NAME = "facebook/opt-125m" +MODEL_NAME = "facebook/opt-30b" +MAX_NEW_TOKENS = 50 # Initialize the chat history -history = ["Human: Can you tell me the weather forecast for tomorrow?\nBot: Try checking a weather app like a normal person.\nHuman: Can you help me find a good restaurant in the area\nBot: Try asking someone with a functioning sense of taste.\n"] -generator = pipeline('text-generation', model=f"{MODEL_NAME}", do_sample=True, torch_dtype=torch.half) +history = [ + "Human: Can you tell me the weather forecast for tomorrow?\nBot: Try checking a weather app like a normal person.\nHuman: Can you help me find a good restaurant in the area\nBot: Try asking someone with a functioning sense of taste.\n" +] + +# max memory to use when loading the model across gpus +# this is only for the model input_text will add to this +# along with model 
internal variables +# change to meet your gpu memory etc +# this works for 2 nvidia rtx 3090's at 8bit +memory_map = {0: "15GiB", 1: "20GiB", "cpu": "40GiB"} + +# for 8 bit to work the model needs to be able to fit on gpus +# this won't use the cpu as well you can load torch.float16 +# with torch_dtype = torch.float16, set load_in_8bit=False, +# you can even use a ssd cache with offload_folder = "./path/to/ssd" +generator = AutoModelForCausalLM.from_pretrained( + MODEL_NAME, + device_map="auto", + max_memory=memory_map, + load_in_8bit=True, +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left") # Define the chatbot logic @@ -21,7 +42,14 @@ def chatbot_response(input_text, history): # Concatenate the input text and history list input_text = "\n".join(history) + "\nHuman: " + input_text + " Bot: " set_seed(32) - response_text = generator(input_text, max_length=1024, num_beams=1, num_return_sequences=1)[0]['generated_text'] + + # tokenize the input text and accelerate needs the data sent to 0 + input_text = tokenizer.encode(input_text, return_tensors="pt").to(0) + # get raw response + response_text = generator.generate(input_text, max_new_tokens=MAX_NEW_TOKENS) + # decode the response + response_text = tokenizer.decode(response_text[0].tolist()) + # Extract the bot's response from the generated text response_text = response_text.split("Bot:")[-1] # Cut off any "Human:" or "human:" parts from the response @@ -30,29 +58,33 @@ return response_text -@app.route('/', methods=['GET', 'POST']) +@app.route("/", methods=["GET", "POST"]) def index(): global history # Make the history variable global - if request.method == 'POST': - input_text = request.form['input_text'] + if request.method == "POST": + input_text = request.form["input_text"] response_text = chatbot_response(input_text, history) # Append the input and response to the chat history history.append(f"Human: {input_text}") history.append(f"Bot: 
{response_text}") else: - input_text = '' - response_text = '' + input_text = "" + response_text = "" # Render the template with the updated chat history - return render_template('index.html', input_text=input_text, response_text=response_text, history=history) + return render_template( + "index.html", input_text=input_text, response_text=response_text, history=history + ) -@app.route('/reset', methods=['POST']) +@app.route("/reset", methods=["POST"]) def reset(): global history # Make the history variable global - history = ["Bot: Hello, how can I help you today? I am a chatbot designed to assist with a variety of tasks and answer questions. You can ask me about anything from general knowledge to specific topics, and I will do my best to provide a helpful and accurate response. Please go ahead and ask me your first question.\n"] + history = [ + "Bot: Hello, how can I help you today? I am a chatbot designed to assist with a variety of tasks and answer questions. You can ask me about anything from general knowledge to specific topics, and I will do my best to provide a helpful and accurate response. Please go ahead and ask me your first question.\n" + ] # Redirect to the chat page - return redirect('/') + return redirect("/") -if __name__ == '__main__': - app.run(host='0.0.0.0', port=5001) +if __name__ == "__main__": + app.run(host="0.0.0.0", port=5001) From dbaf544f7f9287f662dbd814b92cae683db83214 Mon Sep 17 00:00:00 2001 From: zshobbs <32156712+zshobbs@users.noreply.github.com> Date: Wed, 25 Jan 2023 22:58:01 +0000 Subject: [PATCH 2/2] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 72f41a8..109aefe 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ I asked ChatGPT to build an LLM-based chatbot app and this was the result. 
### Running float-16 and 8 bit across multiple GPU's -Hugging face accelerate lets you run LLM's over multiple models, +Hugging face accelerate lets you run LLM's over multiple GPU's, For half precission across multiple gpus accelerate is only needed. Install hugging face [accelerate](https://huggingface.co/docs/accelerate/v0.3.0/installation.html)