Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,45 @@ ChatGPT @ Home: Large Language Model (LLM) chatbot application, written by ChatG
I asked ChatGPT to build an LLM-based chatbot app and this was the result.

<img src="https://pythonprogramming.net/static/images/chatgptathomesocial.png" width="512"/>

### Running float16 and 8-bit across multiple GPUs
Hugging Face Accelerate lets you run LLMs across multiple GPUs.
For half precision (float16) across multiple GPUs, only Accelerate is needed.

Install Hugging Face [accelerate](https://huggingface.co/docs/accelerate/v0.3.0/installation.html)

For 8-bit, bitsandbytes is needed (requires Turing or Ampere GPUs).

Install [bitsandbytes](https://pypi.org/project/bitsandbytes/) and see docs

In app.py, update the `memory_map` max-memory parameter used when loading the model across GPUs.
This budget covers only the model itself; `input_text` and other pipeline variables will add to it.
Change it to match your GPU memory. The values below work for 2 NVIDIA RTX 3090s at 8-bit with "facebook/opt-30b":
```
memory_map = {0: "15GiB", 1: "20GiB", "cpu": "40GiB"}
```
For 8-bit to work, the model needs to fit entirely in the GPUs' VRAM; Accelerate won't
also use the CPU in 8-bit mode. You can instead load in float16 with `torch_dtype=torch.float16`
and `load_in_8bit=False`; you can even use an SSD cache with `offload_folder="./path/to/ssd"`
for parameter offloading.

#### 8 bit
```
generator = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
device_map="auto",
max_memory=memory_map,
load_in_8bit=True,
)
```

#### float16
```
generator = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
device_map="auto",
torch_dtype=torch.float16,
max_memory=memory_map,
load_in_8bit=False,
offload_folder="./path/to/ssd",
)
```
72 changes: 52 additions & 20 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,55 @@
import transformers
from transformers import utils, pipeline, set_seed
import torch
from flask import Flask, request, render_template, session, redirect

import transformers
from flask import Flask, redirect, render_template, request, session
from transformers import (AutoModelForCausalLM, AutoTokenizer, pipeline,
set_seed, utils)

app = Flask(__name__)

# Set the secret key for the session
app.secret_key = 'your-secret-key'
app.secret_key = "your-secret-key"

MODEL_NAME = "facebook/opt-125m"
MODEL_NAME = "facebook/opt-30b"
MAX_NEW_TOKENS = 50

# Initialize the chat history
history = ["Human: Can you tell me the weather forecast for tomorrow?\nBot: Try checking a weather app like a normal person.\nHuman: Can you help me find a good restaurant in the area\nBot: Try asking someone with a functioning sense of taste.\n"]
generator = pipeline('text-generation', model=f"{MODEL_NAME}", do_sample=True, torch_dtype=torch.half)
history = [
"Human: Can you tell me the weather forecast for tomorrow?\nBot: Try checking a weather app like a normal person.\nHuman: Can you help me find a good restaurant in the area\nBot: Try asking someone with a functioning sense of taste.\n"
]

# max memory to use when loading the model across gpus
# this is only for the model input_text will add to this
# along with model interemt variables
# change to meet your gpu memory etc
# this works for 2 nvidia rtx 3090's at 8bit
memory_map = {0: "15GiB", 1: "20GiB", "cpu": "40GiB"}

# for 8 bit to work the model needs to be able to fit on gpus
# this wont use the cpu as well you can load torch.float16
# with torch_dtype = torch.float16, set load_in_8bit=False,
# you can even use a ssd cache with offload_folder = "./path/to/ssd"
generator = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
device_map="auto",
max_memory=memory_map,
load_in_8bit=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left")


# Define the chatbot logic
def chatbot_response(input_text, history):
# Concatenate the input text and history list
input_text = "\n".join(history) + "\nHuman: " + input_text + " Bot: "
set_seed(32)
response_text = generator(input_text, max_length=1024, num_beams=1, num_return_sequences=1)[0]['generated_text']

# tokenize the input text and accelorate needs the data sent to 0
input_text = tokenizer.encode(input_text, return_tensors="pt").to(0)
# get raw response
response_text = generator.generate(input_text, max_new_tokens=MAX_NEW_TOKENS)
# decode the response
response_text = tokenizer.decode(response_text[0].tolist())

# Extract the bot's response from the generated text
response_text = response_text.split("Bot:")[-1]
# Cut off any "Human:" or "human:" parts from the response
Expand All @@ -30,29 +58,33 @@ def chatbot_response(input_text, history):
return response_text


@app.route('/', methods=['GET', 'POST'])
@app.route("/", methods=["GET", "POST"])
def index():
global history # Make the history variable global
if request.method == 'POST':
input_text = request.form['input_text']
if request.method == "POST":
input_text = request.form["input_text"]
response_text = chatbot_response(input_text, history)
# Append the input and response to the chat history
history.append(f"Human: {input_text}")
history.append(f"Bot: {response_text}")
else:
input_text = ''
response_text = ''
input_text = ""
response_text = ""
# Render the template with the updated chat history
return render_template('index.html', input_text=input_text, response_text=response_text, history=history)
return render_template(
"index.html", input_text=input_text, response_text=response_text, history=history
)


@app.route('/reset', methods=['POST'])
@app.route("/reset", methods=["POST"])
def reset():
global history # Make the history variable global
history = ["Bot: Hello, how can I help you today? I am a chatbot designed to assist with a variety of tasks and answer questions. You can ask me about anything from general knowledge to specific topics, and I will do my best to provide a helpful and accurate response. Please go ahead and ask me your first question.\n"]
history = [
"Bot: Hello, how can I help you today? I am a chatbot designed to assist with a variety of tasks and answer questions. You can ask me about anything from general knowledge to specific topics, and I will do my best to provide a helpful and accurate response. Please go ahead and ask me your first question.\n"
]
# Redirect to the chat page
return redirect('/')
return redirect("/")


if __name__ == '__main__':
app.run(host='0.0.0.0', port=5001)
if __name__ == "__main__":
app.run(host="0.0.0.0", port=5001)