In [ ]:
# Installs Unsloth, Xformers (Flash Attention) and all other required packages (uncomment to run, e.g. on Colab)
#!pip install "unsloth[colab-new]" huggingface_hub transformers bitsandbytes
#!pip install -v -U git+https://github.com/facebookresearch/xformers.git@main
#!pip install trl peft datasets flash-attn
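
If the install commands above are uncommented (e.g. on Colab), it is worth confirming that the runtime actually exposes a CUDA GPU before loading a 4-bit model. A minimal, optional sanity-check cell using only standard PyTorch calls:

In [ ]:
import torch

# Confirm a CUDA device is visible and report its name, compute capability,
# and whether bfloat16 is supported (it is not on a T4).
assert torch.cuda.is_available(), "No CUDA GPU detected - 4-bit loading with Unsloth needs one."
print("GPU:", torch.cuda.get_device_name(0))
print("Compute capability:", torch.cuda.get_device_capability(0))
print("bfloat16 supported:", torch.cuda.is_bf16_supported())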
In [ ]:
import torch
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
In [ ]:
max_seq_length = 2048   # maximum context length used for training
load_in_4bit = True     # load the base weights in 4-bit to reduce VRAM usage
fourbits_model = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"  # pre-quantized 4-bit Llama 3.1 8B checkpoint
In [ ]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = fourbits_model,
    max_seq_length = max_seq_length,
    dtype = None,              # None = auto-detect (float16 on T4, bfloat16 on Ampere+)
    load_in_4bit = load_in_4bit,
    attn_implementation = "flash_attention_2",  # requested; the log below shows FA2 = False on this T4
)
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28+33a51bd.d20240816. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
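
As an optional sanity check (not part of the original run), the quantized base model's size in memory can be inspected before LoRA patching; `get_memory_footprint()` is a standard `transformers` model method:

In [ ]:
# Approximate size of the 4-bit base model in GB (parameters + buffers).
print(f"Base model footprint: {model.get_memory_footprint() / 1024**3:.2f} GB")
print(f"Tokenizer vocabulary size: {len(tokenizer)}")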
In [ ]:
# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    max_seq_length = max_seq_length,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
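
To see how small the trainable LoRA adapter is compared to the frozen 4-bit base weights, PEFT models expose `print_trainable_parameters()`; a quick optional check:

In [ ]:
# With r = 16 on the attention and MLP projections, only a small fraction
# of all parameters is trainable; the quantized base weights stay frozen.
model.print_trainable_parameters()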
In [ ]:
# Load the LAION OIG unified_chip2 dataset (JSONL, one dialogue per line in a "text" field)
url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
dataset = load_dataset("json", data_files = {"train" : url}, split = "train")
In [ ]:
dataset.data[0]   # peek at the underlying Arrow "text" column; a few truncated examples are shown below

Human: Describe the location where you find yourself in a serene scene inside of a medieval inn with a rice paper screen.
Bot: Sturdy rectory tables and low benches offer a quiet place to eat and drink food brought with you or bought from the nearby bakery. At the back of the room, the door leading to the kitchen and the owner's private quarters is half-hidden by a rice paper screen.

Human: I want to find out more about a certain type of bird.
Bot: That sounds like a great idea. What kind of bird are you interested in?

Human: round number to nearest integer
Bot: Here is some code to round number 4.0005 up to 3 decimal places, round(4.0005, 3)

Human: I don't understand my tax form. Can you help explain?
Bot: I'm sorry, I'm not able to help with your tax form. It is best to consult a tax specialist or the IRS for further explanation.
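
The examples above end without an explicit end-of-sequence marker. An optional preprocessing step (not in the original run) appends the tokenizer's EOS token to every "text" field so the fine-tuned model learns where a reply ends; a sketch, assuming the dataset keeps its single "text" column:

In [ ]:
EOS_TOKEN = tokenizer.eos_token  # "<|end_of_text|>" for the Llama 3.1 base tokenizer

def add_eos(example):
    # Append EOS so generation learns to stop at the end of a conversation.
    return {"text": example["text"] + EOS_TOKEN}

dataset = dataset.map(add_eos)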

In [ ]:
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    dataset_text_field = "text",       # column holding the raw training text
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,   # effective batch size = 2 * 4 = 8
        warmup_steps = 20,
        max_steps = 120,                   # short demo run; use num_train_epochs for a full pass
        fp16 = not is_bfloat16_supported(),  # fall back to fp16 on GPUs without bfloat16 (e.g. T4)
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        learning_rate = 5e-5,
        weight_decay = 0.01,
        output_dir = "outputs",
        optim = "adamw_8bit",              # 8-bit AdamW from bitsandbytes to save optimizer memory
        lr_scheduler_type = "linear",
        seed = 3407,
    ),
)
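
Before launching training it can be useful to record how much GPU memory the 4-bit model and LoRA weights already reserve, so the extra cost of training can be measured afterwards. A sketch using standard torch.cuda calls:

In [ ]:
gpu = torch.cuda.get_device_properties(0)
reserved_gb = torch.cuda.max_memory_reserved() / 1024**3
print(f"GPU: {gpu.name}, total memory: {gpu.total_memory / 1024**3:.2f} GB")
print(f"Memory reserved before training: {reserved_gb:.2f} GB")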
In [ ]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(...)
      (lm_head): Linear(in_features=4096, out_features=128256, bias=False)
    )
  )
)
In [ ]:
# Start training and save the resulting LoRA adapter
trainer.train()
trainer.save_model("finetuned_llm")

Step    Training Loss
   1        1.737900
  10        1.051600
  20        1.380700
  30        1.025000
  40        1.595500
  50        1.414300
  60        1.337900
  70        1.288200
  80        1.418700
  90        1.016300
 100        0.969700
 110        1.292600
 120        1.123400
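
After training, the adapter saved by `trainer.save_model` can be used for generation straight away. A minimal inference sketch, assuming the in-memory `model` and `tokenizer` from above; `FastLanguageModel.for_inference` switches Unsloth into its faster inference mode (a plain `model.generate` call would also work):

In [ ]:
FastLanguageModel.for_inference(model)  # enable Unsloth's optimized inference path

prompt = "Human: I want to find out more about a certain type of bird.\nBot:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Unsloth also provides helpers such as `model.save_pretrained_merged(...)` for exporting merged 16-bit weights, which may be preferable if the model is to be served without PEFT.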