GPU memory usage increases at each inference #70

Open

FSet89 opened this issue Aug 19, 2024 · 0 comments

FSet89 commented Aug 19, 2024

I am testing the 8B model with a simple custom API:

from fastapi import FastAPI, UploadFile, File, Form
from pydantic import BaseModel
from PIL import Image
import io
from typing import List
import torch
import numpy as np
import random
import requests
import sys
sys.path.append('cambrian')
from cambrian.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from cambrian.conversation import conv_templates, SeparatorStyle
from cambrian.model.builder import load_pretrained_model
from cambrian.utils import disable_torch_init
from cambrian.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path


def process(image, question, tokenizer, image_processor, model_config):
    qs = question

    if model_config.mm_use_im_start_end:
        qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
    else:
        qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    
    image_size = [image.size]
    image_tensor = process_images([image], image_processor, model_config)

    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

    return input_ids, image_tensor, image_size, prompt


app = FastAPI()

## LOAD MODEL ##
conv_mode = "llama_3" 
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

model_path = '/home/ubuntu/nyu-visionx/cambrian-8b'
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name)

temperature = 0

class InferenceRequest(BaseModel):
    question: str

@app.post("/predict/")
async def predict(images: List[str] = Form(...), question: str = Form(...)):
    image = Image.open(requests.get(images[0], stream=True).raw)

    input_ids, image_tensor, image_sizes, prompt = process(image, question, tokenizer, image_processor, model.config)
    input_ids = input_ids.to(device='cuda', non_blocking=True)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            image_sizes=image_sizes,
            do_sample=True if temperature > 0 else False,
            temperature=temperature,
            num_beams=1,
            max_new_tokens=256,
            use_cache=True)

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()   
    print("Outputs", outputs)
    return {"response": [outputs]}

When the model is loaded, nvidia-smi shows 21684 MiB of GPU memory in use. However, after the first query the usage increases to 22356 MiB. Is this normal? How can I free the memory after each query?
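For example, would adding something like the following at the end of the handler be the right approach? This is just a sketch on my side (not verified with Cambrian); my assumption is that part of the increase is the KV cache and temporary buffers from generate(..., use_cache=True), which PyTorch's caching allocator keeps reserved, so nvidia-smi continues to report them.

import gc
import torch

# At the end of predict(), after decoding the output:
del input_ids, image_tensor, output_ids  # drop references to the large tensors
gc.collect()                             # let Python reclaim them
torch.cuda.empty_cache()                 # release unused cached blocks so nvidia-smi reflects the drop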
