1. Installing the required modules
➢ Only the key modules are listed.
- torch (https://pytorch.org/get-started/previous-versions/): install the build that matches your local machine (CUDA version).
# CUDA 11.7
conda install pytorch==1.13.0 torchvision==0.14.0 torchaudio==0.13.0 pytorch-cuda=11.7 -c pytorch -c nvidia
- transformers
pip install transformers
- pytorch-lightning
pip install pytorch-lightning
2. Preparing the dataset
- Caveat: the dataset for DETR must be prepared in COCO format.
(Check the format of the dataset you downloaded: Pascal VOC, YOLO, COCO, etc. If it needs converting, find a conversion script and run it.)
► Sites where you can find datasets:
- https://paperswithcode.com/datasets
► Dataset format description:
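→ For reference, a COCO annotation file is a single JSON object with three top-level keys: images, annotations, and categories. Below is a minimal sketch of that structure as a Python dict (the file name, ids, and box values are made-up examples):
# minimal sketch of the COCO detection annotation structure (illustrative values only)
coco_format = {
    "images": [
        {"id": 1, "file_name": "0000001.jpg", "width": 1360, "height": 765},
    ],
    "annotations": [
        # bbox is [x_min, y_min, width, height] in absolute pixels
        {"id": 1, "image_id": 1, "category_id": 0, "bbox": [100, 50, 40, 80], "area": 3200, "iscrowd": 0},
    ],
    "categories": [
        {"id": 0, "name": "pedestrian"},
    ],
}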
3. PyTorch custom dataset
→ A CustomDataset needs to be implemented as follows.
from torch.utils.data import Dataset
class CustomDataset(Dataset):
    def __init__(self):
        # constructor: load and preprocess the data here
        pass
    def __len__(self):
        # return the total number of samples in the dataset
        pass
    def __getitem__(self, idx):
        # return the input/target pair at index idx
        pass
# Source: https://wikidocs.net/156998
→ Since the data is in COCO format, the following CustomDataset is used.
import torchvision
import os
class CocoDetection(torchvision.datasets.CocoDetection):
def __init__(self, img_folder, processor, train=True):
ann_file = os.path.join(img_folder, "annotations_VisDrone_train.json" if train else "annotations_VisDrone_val.json")
super(CocoDetection, self).__init__(img_folder, ann_file)
self.processor = processor
def __getitem__(self, idx):
# read in PIL image and target in COCO format
# feel free to add data augmentation here before passing them to the next step
img, target = super(CocoDetection, self).__getitem__(idx)
# preprocess image and target (converting target to DETR format, resizing + normalization of both image and target)
image_id = self.ids[idx]
target = {'image_id': image_id, 'annotations': target}
encoding = self.processor(images=img, annotations=target, return_tensors="pt")
pixel_values = encoding["pixel_values"].squeeze() # remove batch dimension
target = encoding["labels"][0] # remove batch dimension
return pixel_values, target
→ Applying the CocoDetection custom dataset
# import DetrImageProcessor for preprocessing
from transformers import DetrImageProcessor
processor_DETR = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
train_dataset = CocoDetection(img_folder='../98.datasets/VisDrone2019/vis_train/',
processor=processor_DETR)
val_dataset = CocoDetection(img_folder='../98.datasets/VisDrone2019/vis_val/',
processor=processor_DETR, train=False)
print("Number of training examples:", len(train_dataset))
print("Number of validation examples:", len(val_dataset))
4. Visualizing the input data
import numpy as np
import os
from PIL import Image, ImageDraw
# based on https://github.com/woctezuma/finetune-detr/blob/master/finetune_detr.ipynb
image_ids = train_dataset.coco.getImgIds()
# let's pick a random image
image_id = image_ids[np.random.randint(0, len(image_ids))]
print('Image n°{}'.format(image_id))
image = train_dataset.coco.loadImgs(image_id)[0]
image = Image.open(os.path.join('../98.datasets/VisDrone2019/vis_train', image['file_name']))
annotations = train_dataset.coco.imgToAnns[image_id]
draw = ImageDraw.Draw(image, "RGBA")
cats = train_dataset.coco.cats
id2label = {k: v['name'] for k,v in cats.items()}
print(id2label)
for annotation in annotations:
box = annotation['bbox']
class_idx = annotation['category_id']
x,y,w,h = tuple(box)
draw.rectangle((x,y,x+w,y+h), outline='red', width=1)
draw.text((x, y), id2label[class_idx], fill='white')
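# in Jupyter, the bare `image` expression below renders the annotated image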
image
print(len(id2label))
5. Running the PyTorch DataLoader
→ Here, collate_fn pads every item grouped into a batch to a common size. In other words, with batch_size=1 the image size does not matter, but with batch_size ≥ 2 the images in a batch generally have different sizes, so stacking them raises an error; collate_fn solves this by padding the images (via the processor) and returning the matching pixel_mask.
from torch.utils.data import DataLoader
def collate_fn(batch):
pixel_values = [item[0] for item in batch]
encoding = processor_DETR.pad(pixel_values, return_tensors="pt")
labels = [item[1] for item in batch]
batch = {}
batch['pixel_values'] = encoding['pixel_values']
batch['pixel_mask'] = encoding['pixel_mask']
batch['labels'] = labels
return batch
train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=4, shuffle=True, num_workers=79)
val_dataloader = DataLoader(val_dataset, collate_fn=collate_fn, batch_size=2, num_workers=79)
batch = next(iter(train_dataloader))
print('batch key :',batch.keys())
print()
pixel_values, target=train_dataset[0]
print(pixel_values.shape)
print(target)
6. Running PyTorch Lightning
→ What is PyTorch Lightning? An open-source Python library that provides a high-level interface on top of PyTorch. With plain PyTorch you end up writing many lines of setup code by hand; Lightning lets you write the same logic in a much more organized way.
→ ignore_mismatched_sizes=True must be set. This flag lets the pretrained checkpoint load even when a layer's shape does not match the new model.
(The pretrained DETR head predicts COCO's 91 classes plus a "no object" class, i.e. 92 logits; with this flag the head is re-initialized to match the num_labels you pass and trained.)
import pytorch_lightning as pl
from transformers import DetrForObjectDetection
import torch
class Detr(pl.LightningModule):
def __init__(self, lr, lr_backbone, weight_decay):
super().__init__()
# replace COCO classification head with custom head
# we specify the "no_timm" variant here to not rely on the timm library
# for the convolutional backbone
# num_labels : number of classes
# ignore_mismatched_sizes : ignore shape mismatches between the pretrained checkpoint and the newly initialized classification head
self.model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50",
revision="no_timm",
id2label=id2label,
num_labels=len(id2label),
ignore_mismatched_sizes=True)
# see https://github.com/PyTorchLightning/pytorch-lightning/pull/1896
self.lr = lr
self.lr_backbone = lr_backbone
self.weight_decay = weight_decay
def forward(self, pixel_values, pixel_mask):
outputs = self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)
return outputs
def common_step(self, batch, batch_idx):
pixel_values = batch["pixel_values"]
pixel_mask = batch["pixel_mask"]
labels = [{k: v.to(self.device) for k, v in t.items()} for t in batch["labels"]]
outputs = self.model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)
loss = outputs.loss
loss_dict = outputs.loss_dict
return loss, loss_dict
def training_step(self, batch, batch_idx):
loss, loss_dict = self.common_step(batch, batch_idx)
# logs metrics for each training_step,
# and the average across the epoch
self.log("training_loss", loss)
for k,v in loss_dict.items():
self.log("train_" + k, v.item())
return loss
def validation_step(self, batch, batch_idx):
loss, loss_dict = self.common_step(batch, batch_idx)
self.log("validation_loss", loss)
for k,v in loss_dict.items():
self.log("validation_" + k, v.item())
return loss
def configure_optimizers(self):
param_dicts = [
{"params": [p for n, p in self.named_parameters() if "backbone" not in n and p.requires_grad]},
{
"params": [p for n, p in self.named_parameters() if "backbone" in n and p.requires_grad],
"lr": self.lr_backbone,
},
]
optimizer = torch.optim.AdamW(param_dicts, lr=self.lr,
weight_decay=self.weight_decay)
return optimizer
def train_dataloader(self):
return train_dataloader
def val_dataloader(self):
return val_dataloader
model_detr = Detr(lr=1e-4, lr_backbone=1e-5, weight_decay=1e-4)
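# quick sanity check: forward one batch through the freshly initialized model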
outputs = model_detr(pixel_values=batch['pixel_values'], pixel_mask=batch['pixel_mask'])
# print the state_dict keys to inspect the architecture (useful after modifying the model structure)
print('model state_dict keys ')
for i in model_detr.state_dict().keys():
print(i)
## total / trainable / frozen parameter counts
total_params = sum(p.numel() for p in model_detr.parameters())
print("Total Parameters:", total_params)
total_trainable_params = sum(p.numel() for p in model_detr.parameters() if p.requires_grad)
print("Total Trainable Parameters:", total_trainable_params)
total_fixed_params = sum(p.numel() for p in model_detr.parameters() if not p.requires_grad)
print("Total Fixed Parameters:", total_fixed_params)
7. Training
→ Early stopping can be applied (see the commented-out code below).
from pytorch_lightning import Trainer
import os
## code to apply early stopping
# from lightning.pytorch.callbacks.early_stopping import EarlyStopping
# early_stop_callback = pl.callbacks.EarlyStopping(monitor="validation_loss", min_delta=0.00, patience=3, verbose=False, mode="min")
# trainer = Trainer(accelerator="gpu",devices=[3],max_epochs=30 ,callbacks=[early_stop_callback],gradient_clip_val=0.1, accumulate_grad_batches=8, log_every_n_steps=5)
# trainer.fit(model_detr)
trainer = Trainer(accelerator="gpu",devices=[0],max_epochs=150 ,gradient_clip_val=0.1, accumulate_grad_batches=8, log_every_n_steps=5)
trainer.fit(model_detr)
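→ Optionally, a ModelCheckpoint callback can keep the best weights by validation loss. Below is a minimal sketch reusing the validation_loss key logged above (tune save_top_k and the other arguments to your needs):
# optional sketch: keep the best checkpoint by validation loss
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="validation_loss", mode="min", save_top_k=1)
trainer = Trainer(accelerator="gpu", devices=[0], max_epochs=150, gradient_clip_val=0.1, accumulate_grad_batches=8, log_every_n_steps=5, callbacks=[checkpoint_callback])
trainer.fit(model_detr)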
8. Pushing the model to the Hugging Face Hub
from huggingface_hub import notebook_login
notebook_login()  # or run `huggingface-cli login` in a terminal first
model_detr.model.push_to_hub("name/detr_custom")
processor_DETR.push_to_hub("name/detr_custom")
9. Model evaluation
→ Load the model by passing the repo name you pushed to in step 8.
# model load
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
model = DetrForObjectDetection.from_pretrained("name/detr_custom")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
processor = DetrImageProcessor.from_pretrained("name/detr_custom")
def convert_to_xywh(boxes):
xmin, ymin, xmax, ymax = boxes.unbind(1)
return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
def prepare_for_coco_detection(predictions):
coco_results = []
for original_id, prediction in predictions.items():
if len(prediction) == 0:
continue
boxes = prediction["boxes"]
boxes = convert_to_xywh(boxes).tolist()
scores = prediction["scores"].tolist()
labels = prediction["labels"].tolist()
coco_results.extend(
[
{
"image_id": original_id,
"category_id": labels[k],
"bbox": box,
"score": scores[k],
}
for k, box in enumerate(boxes)
]
)
return coco_results
→ Run the evaluation code.
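# coco_eval.py comes from the DETR tutorial repo linked in the references; it wraps pycocotools' COCOeval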
from coco_eval import CocoEvaluator
from tqdm.notebook import tqdm
import numpy as np
# initialize evaluator with ground truth (gt)
evaluator = CocoEvaluator(coco_gt=val_dataset.coco, iou_types=["bbox"])
print("Running evaluation...")
for idx, batch in enumerate(tqdm(val_dataloader)):
# get the inputs
pixel_values = batch["pixel_values"].to(device)
pixel_mask = batch["pixel_mask"].to(device)
labels = [{k: v.to(device) for k, v in t.items()} for t in batch["labels"]] # these are in DETR format, resized + normalized
# forward pass
with torch.no_grad():
outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
# turn into a list of dictionaries (one item for each example in the batch)
orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
results = processor.post_process_object_detection(outputs, target_sizes=orig_target_sizes, threshold=0)
# provide to metric
# metric expects a list of dictionaries, each item
# containing image_id, category_id, bbox and score keys
predictions = {target['image_id'].item(): output for target, output in zip(labels, results)}
predictions = prepare_for_coco_detection(predictions)
evaluator.update(predictions)
evaluator.synchronize_between_processes()
evaluator.accumulate()
evaluator.summarize()
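→ evaluator.summarize() prints the standard COCO metrics: AP averaged over IoU thresholds 0.50:0.95, AP at IoU 0.50 and 0.75, and AP/AR broken down by object size.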
10. Inference (+ visualization)
# take an arbitrary sample from val_dataset: val_dataset[1]
pixel_values, target = val_dataset[1]
pixel_values = pixel_values.unsqueeze(0).to(device)
print(pixel_values.shape)
with torch.no_grad():
# forward pass to get class logits and bounding boxes
outputs = model(pixel_values=pixel_values, pixel_mask=None)
print("Outputs:", outputs.keys())
import matplotlib.pyplot as plt
# colors for visualization
COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
[0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]
def plot_results(pil_img, scores, labels, boxes):
plt.figure(figsize=(16,10))
plt.imshow(pil_img)
ax = plt.gca()
colors = COLORS * 100
for score, label, (xmin, ymin, xmax, ymax),c in zip(scores.tolist(), labels.tolist(), boxes.tolist(), colors):
ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
fill=False, color=c, linewidth=3))
text = f'{model.config.id2label[label]}: {score:0.2f}'
ax.text(xmin, ymin, text, fontsize=15,
bbox=dict(facecolor='yellow', alpha=0.5))
plt.axis('off')
plt.show()
# load image based on ID
image_id = target['image_id'].item()
image = val_dataset.coco.loadImgs(image_id)[0]
image = Image.open(os.path.join('../98.datasets/VisDrone2019/vis_val', image['file_name']))
# postprocess model outputs
width, height = image.size
postprocessed_outputs = processor.post_process_object_detection(outputs,
target_sizes=[(height, width)],
threshold=0.9)
results = postprocessed_outputs[0]
plot_results(image, results['scores'], results['labels'], results['boxes'])
11. Miscellaneous
- Params
- Trainable params : encoder, decoder, FFN heads
- Non-trainable params : the frozen part of the ResNet backbone (you can check exactly which with the sketch below)
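→ A quick sketch to verify which parameters are frozen (assumes the model_detr instance from section 6):
# list the parameters that will not be updated during training
for name, p in model_detr.named_parameters():
    if not p.requires_grad:
        print("frozen:", name)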
Paper
- https://arxiv.org/abs/2005.12872
References
- https://blog.roboflow.com/train-detr-on-custom-dataset/
- https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR