The new version 0.3.0 takes a long time for quantization and eventually fails due to OOM #965

Open
okwinds opened this issue Dec 10, 2024 · 5 comments
Labels: bug (Something isn't working)

okwinds commented Dec 10, 2024

Describe the bug
I used the sample code (W8A16) to quantize THUDM/glm-4-9b-chat-hf on an NVIDIA RTX 4090. The entire process was very slow (nearly 24 hours) and memory usage was extremely high, to the point where an out-of-memory (OOM) kill occurred at the final step. When the OOM happened there was no obvious error message; the only output was:

[1] 216936 killed python3 test_ct.py
WSL environment:
compressed-tensors 0.8.0
llmcompressor 0.3.0

Memory: 47 GB
Swap: 40 GB

Using the same example code in an otherwise identical environment, I downgraded to compressed-tensors 0.7.0 and llmcompressor 0.2.0. The quantization process then completed smoothly and took only about 2 hours.

Expected behavior
Quantization with version 0.3.0 should complete in a reasonable time (comparable to 0.2.0) and without running out of memory.

Environment

  1. OS Ubuntu 22.04
  2. Python version 3.11.9
  3. CUDA Version 12.4.1

Errors
OOM
[1] 216936 killed python3 test_ct.py

okwinds added the bug label on Dec 10, 2024
horheynm (Collaborator) commented

@okwinds
Could you share the example script you are using?

horheynm mentioned this issue Dec 17, 2024
horheynm (Collaborator) commented

@okwinds
Can you try this script?

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "THUDM/glm-4-9b-chat-hf"
scheme = "W8A16"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

print(MODEL_ID)
def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithm to run.
#   * quantize the weights to 8 bit with GPTQ (W8A16 scheme), ignoring lm_head
recipe = GPTQModifier(targets="Linear", scheme=scheme, ignore=["lm_head"])

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

okwinds (Author) commented Dec 24, 2024

@okwinds Can you try this script?


I quantized the model following this script, and then the issue I described earlier occurred.


HelloCard commented Dec 25, 2024

===== Compressing layer 40/40  =====
2024-12-25T23:28:46.630287+0800 | apply_compression | INFO - Calibrating model.layers.39...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2048/2048 [01:31<00:00, 22.34it/s]
2024-12-25T23:30:18.304183+0800 | compress_module | INFO - Compressing model.layers.39.model.layers.39.self_attn.q_proj...
2024-12-25T23:30:19.708405+0800 | compress | METRIC - time 1.39
2024-12-25T23:30:19.710033+0800 | compress | METRIC - error 868.16
2024-12-25T23:30:19.710220+0800 | compress | METRIC - GPU 0 | usage: 46.83% | total memory: 25 GB
2024-12-25T23:30:19.710267+0800 | compress | METRIC - GPU 1 | usage: 62.10% | total memory: 25 GB
2024-12-25T23:30:19.710333+0800 | compress | METRIC - Compressed layer size: 41.955328 MB
2024-12-25T23:30:19.710594+0800 | compress_module | INFO - Compressing model.layers.39.model.layers.39.self_attn.k_proj...
2024-12-25T23:30:21.036351+0800 | compress | METRIC - time 1.33
2024-12-25T23:30:21.038086+0800 | compress | METRIC - error 247.34
2024-12-25T23:30:21.038430+0800 | compress | METRIC - GPU 0 | usage: 46.83% | total memory: 25 GB
2024-12-25T23:30:21.038486+0800 | compress | METRIC - GPU 1 | usage: 62.10% | total memory: 25 GB
2024-12-25T23:30:21.038563+0800 | compress | METRIC - Compressed layer size: 10.488832 MB
2024-12-25T23:30:21.038817+0800 | compress_module | INFO - Compressing model.layers.39.model.layers.39.self_attn.v_proj...
2024-12-25T23:30:22.371106+0800 | compress | METRIC - time 1.33
2024-12-25T23:30:22.373088+0800 | compress | METRIC - error 1044.24
2024-12-25T23:30:22.373275+0800 | compress | METRIC - GPU 0 | usage: 46.83% | total memory: 25 GB
2024-12-25T23:30:22.373321+0800 | compress | METRIC - GPU 1 | usage: 62.10% | total memory: 25 GB
2024-12-25T23:30:22.373386+0800 | compress | METRIC - Compressed layer size: 10.488832 MB
2024-12-25T23:30:22.373630+0800 | compress_module | INFO - Compressing model.layers.39.model.layers.39.self_attn.o_proj...
2024-12-25T23:30:23.494157+0800 | compress | METRIC - time 1.12
2024-12-25T23:30:23.495993+0800 | compress | METRIC - error 35.28
2024-12-25T23:30:23.496186+0800 | compress | METRIC - GPU 0 | usage: 46.83% | total memory: 25 GB
2024-12-25T23:30:23.496233+0800 | compress | METRIC - GPU 1 | usage: 62.10% | total memory: 25 GB
2024-12-25T23:30:23.496300+0800 | compress | METRIC - Compressed layer size: 41.9584 MB
2024-12-25T23:30:23.496565+0800 | compress_module | INFO - Compressing model.layers.39.model.layers.39.mlp.gate_proj...
2024-12-25T23:30:25.070994+0800 | compress | METRIC - time 1.57
2024-12-25T23:30:25.072680+0800 | compress | METRIC - error 1903.98
2024-12-25T23:30:25.072867+0800 | compress | METRIC - GPU 0 | usage: 46.83% | total memory: 25 GB
2024-12-25T23:30:25.072913+0800 | compress | METRIC - GPU 1 | usage: 66.01% | total memory: 25 GB
2024-12-25T23:30:25.072981+0800 | compress | METRIC - Compressed layer size: 146.843648 MB
2024-12-25T23:30:25.073232+0800 | compress_module | INFO - Compressing model.layers.39.model.layers.39.mlp.up_proj...
2024-12-25T23:30:26.664661+0800 | compress | METRIC - time 1.59
2024-12-25T23:30:26.666537+0800 | compress | METRIC - error 1177.15
2024-12-25T23:30:26.666810+0800 | compress | METRIC - GPU 0 | usage: 46.83% | total memory: 25 GB
2024-12-25T23:30:26.666887+0800 | compress | METRIC - GPU 1 | usage: 66.01% | total memory: 25 GB
2024-12-25T23:30:26.666971+0800 | compress | METRIC - Compressed layer size: 146.843648 MB
2024-12-25T23:30:26.667265+0800 | compress_module | INFO - Compressing model.layers.39.model.layers.39.mlp.down_proj...
2024-12-25T23:30:30.839260+0800 | compress | METRIC - time 4.17
2024-12-25T23:30:30.842626+0800 | compress | METRIC - error 1571.74
2024-12-25T23:30:30.842823+0800 | compress | METRIC - GPU 0 | usage: 46.83% | total memory: 25 GB
2024-12-25T23:30:30.842872+0800 | compress | METRIC - GPU 1 | usage: 72.39% | total memory: 25 GB
2024-12-25T23:30:30.842937+0800 | compress | METRIC - Compressed layer size: 146.816 MB
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2048/2048 [00:24<00:00, 84.12it/s]
2024-12-25T23:30:58.297551+0800 | apply_compression | INFO - Mean output error from quantization: 0.016
manager stage: Modifiers initialized
2024-12-25T23:30:58.369488+0800 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
manager stage: Modifiers finalized
2024-12-25T23:30:58.370182+0800 | finalize | INFO - Compression lifecycle finalized for 1 modifiers
2024-12-25T23:31:46.299670+0800 | get_model_compressor | INFO - Inferring a sparsity configuration requires a global sparsity calculation. This can be costly for large models. To skip the calculation of compression statistics set skip_compression_stats=True
Calculating model sparsity:   0%|                                                                                                                                                                     | 0/923 [00:00<?, ?it/s]Killed
root@autodl-container-df6c4fa6bd-4ed9365a:~/autodl-tmp#

Same problem here. This is the script I used:

from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoTokenizer
import torch
MODEL_ID = "/root/autodl-tmp/SAINEMO-reMIX"
model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

from datasets import load_dataset

NUM_CALIBRATION_SAMPLES = 2048
MAX_SEQUENCE_LENGTH = 2048

# Load and preprocess the dataset
ds = load_dataset("/root/autodl-tmp/ultrachat_2k", split="train_sft")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

def preprocess(example):
    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
ds = ds.map(preprocess)

def tokenize(sample):
    return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(tokenize, remove_columns=ds.column_names)


from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

# Configure the quantization algorithms

recipe = [
    SmoothQuantModifier(smoothing_strength=0.86),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"], sequential_update=True),
]

# Apply quantization
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Save the compressed model
SAVE_DIR = MODEL_ID.split("/")[-1] + "-W8A8-Dynamic-Per-Token"  # use the last path component, since MODEL_ID is a local path
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
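
Side note on the log above: the process is killed while "Calculating model sparsity", and the immediately preceding log line says this statistic can be skipped. A minimal sketch of saving with that option, assuming skip_compression_stats is accepted by the wrapped save_pretrained as the log message suggests:

# Assumption: skip_compression_stats=True is honored by save_pretrained,
# per the "set skip_compression_stats=True" hint in the log above.
model.save_pretrained(
    SAVE_DIR,
    save_compressed=True,
    skip_compression_stats=True,  # skip the costly global sparsity calculation
)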

dsikka (Collaborator) commented Jan 9, 2025

Hi, can you try providing the following input for the device map?

import torch

from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

# Number of GPUs to spread the model (and GPTQ Hessians) across;
# torch.cuda.device_count() is one reasonable choice.
n_gpus = torch.cuda.device_count()

device_map = calculate_offload_device_map(
    MODEL_ID, num_gpus=n_gpus, reserve_for_hessians=True, torch_dtype=torch.bfloat16
)
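
For completeness, a minimal sketch of how the resulting device_map might then be passed when loading the model, mirroring the loading code in the scripts above (the loading class and dtype here are assumptions, not something this thread prescribes):

from transformers import AutoModelForCausalLM

# Hypothetical usage: load the model with the precomputed offload device map.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map=device_map,       # computed by calculate_offload_device_map above
    torch_dtype=torch.bfloat16,  # matches the dtype passed to the helper
)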
