I wrote a YOLO in TensorFlow, but my loss is not updating. Can someone check it for me?
Link to my full code on GitHub:
My optimizer update code:
if ni <= nw:
    xi = [0, nw]
    # Strange -- why compute this? As far as I can tell it takes the ni-th linearly
    # interpolated value between 1 and nbs / total_batch_size, clamps it to at least 1,
    # and then rounds it. Accumulate? Accumulate what?
    accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
    # What is this doing?
    # for j, x in enumerate(opt):
    # Adjust lr and momentum via _set_hyper
    # optimizer._set_hyper("learning_rate", np.interp(ni, xi, [0.0, hyp['lr0'] * lf(epoch)]))
    optimizer._set_hyper("momentum", np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]))
# Multi-scale resizing?
if opt.multi_scale:
    sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs
    sf = sz / max(imgs.shape[2:])  # scale factor
    if sf != 1:
        ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
        imgs = tf.image.resize(imgs, ns, tf.image.ResizeMethod.BILINEAR, False)
# Forward
with tf.GradientTape() as gt:
    pred = model(imgs)
    loss, loss_items = compute_loss(pred, targets, model)
    if rank != -1:
        # gradients are averaged between devices in DDP mode,
        # so scale the loss inside the tape
        loss *= opt.world_size
print(loss_items, end='')
print(loss)
grads = gt.gradient(loss, model.trainable_variables)
# Apply the gradients, skipping variables that received no gradient
optimizer.apply_gradients((grad, var) for (grad, var) in zip(grads, model.trainable_variables) if grad is not None)
# optimizer.apply_gradients(zip(grads, model.trainable_variables))  # do not feed every trainable variable blindly
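For context, here is what I think that warmup block does, as a standalone sketch (the nw, nbs, and total_batch_size values below are made up for illustration, not from my repo): accumulate ramps linearly from 1 up to nbs / total_batch_size over the first nw batches, and momentum ramps from warmup_momentum up to momentum.

import numpy as np

# Standalone illustration of the warmup schedule (hypothetical values).
nw = 1000                      # number of warmup batches
nbs = 64                       # nominal batch size
total_batch_size = 16          # actual total batch size
warmup_momentum, momentum = 0.8, 0.937

for ni in (0, 250, 500, 1000):
    xi = [0, nw]
    accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
    mom = np.interp(ni, xi, [warmup_momentum, momentum])
    print(ni, accumulate, round(float(mom), 4))
# accumulate ramps 1 -> 4 and momentum ramps 0.8 -> 0.937 over the first nw batches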
My training code:
for epoch in range(start_epoch, epochs):
    # Update image weights
    if opt.image_weights:
        # Generate indices
        if rank in [-1, 0]:
            cw = model.class_widths * (1 - maps) ** 2 / nc  # class weights
            iw = model.class_widths * (1 - maps) ** 2 / nc  # image weights
            dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)
        # In DDP mode (DDP is distributed training?), the indices need to be broadcast
        # if rank != -1:
        #     indices = np.array(dataset.indices) if rank == 0 else np.zeros(dataset.n, dtype=np.int8)
    mloss = np.zeros(5)
    logger.info(('\n' + '%10s' * 9) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'landmark', 'total', 'targets', 'img_size'))
    # dataset length
    pbar = range(nb)
    if rank in [-1, 0]:
        pbar = tqdm(pbar, total=nb)
    for i in pbar:
        # number of integrated batches (since training start) --
        # roughly, how many batches this run has consumed so far
        ni = i + nb * epoch
        # Assemble the data this batch needs, by index
        imgs, targets, paths = [], [], []
        batch_index = 0
        for imgi in range(i * batch_size, i * batch_size + batch_size):
            # indices start at 0, so stay below the dataset length dl
            if imgi < dl:
                img, target, path = dataset[imgi]
                # Changing the image shape inside __getitem__ is awkward; do it out here instead
                if opt.format == 'NHWC':
                    img = tf.transpose(img, perm=[1, 2, 0]).numpy()  # CHW -> HWC
                imgs.append(img)
                targets.append(target)
                paths.append(path)
        imgs, targets, paths = dataset.collate_fn(imgs, targets, paths)
        imgs = np.array(imgs, dtype=np.float32) / 255.0  # uint8 0-255 -> float32 0.0-1.0
        # Warmup
        if ni <= nw:
            xi = [0, nw]
            # Strange -- why compute this? As far as I can tell it takes the ni-th linearly
            # interpolated value between 1 and nbs / total_batch_size, clamps it to at least 1,
            # and then rounds it. Accumulate? Accumulate what?
            accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
            # What is this doing?
            # for j, x in enumerate(opt):
            # Adjust lr and momentum via _set_hyper
            # optimizer._set_hyper("learning_rate", np.interp(ni, xi, [0.0, hyp['lr0'] * lf(epoch)]))
            optimizer._set_hyper("momentum", np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]))
        # Multi-scale resizing?
        if opt.multi_scale:
            sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs
            sf = sz / max(imgs.shape[2:])  # scale factor
            if sf != 1:
                ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                imgs = tf.image.resize(imgs, ns, tf.image.ResizeMethod.BILINEAR, False)
        # Forward
        with tf.GradientTape() as gt:
            pred = model(imgs)
            loss, loss_items = compute_loss(pred, targets, model)
            if rank != -1:
                # gradients are averaged between devices in DDP mode,
                # so scale the loss inside the tape
                loss *= opt.world_size
        print(loss_items, end='')
        print(loss)
        grads = gt.gradient(loss, model.trainable_variables)
        # Apply the gradients, skipping variables that received no gradient
        optimizer.apply_gradients((grad, var) for (grad, var) in zip(grads, model.trainable_variables) if grad is not None)
        # optimizer.apply_gradients(zip(grads, model.trainable_variables))  # do not feed every trainable variable blindly
        # ? What is this?
        # TensorFlow mixed-precision training... will look into it later, not now
        # scaler.scale(loss).backward()
        # Optimize
        # if ni % accumulate == 0:
        #     scaler.step(optimizer)
        #     scaler.update()
        #     optimizer.zero_grad()
        #     if ema:
        #         ema.update(model)
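        # NOTE: `accumulate` from the warmup block is computed but never actually used,
        # since the `if ni % accumulate == 0` step block above is still commented out;
        # apply_gradients runs every batch, so no gradient accumulation happens.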
        # Print
        if rank in [-1, 0]:
            mloss = (mloss * i + loss_items) / (i + 1)  # update the running mean loss
            mem = '%.3G' % (pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1E9 if useGpu else 0)
            s = ('%10s' * 2 + '%10.4g' * 7) % (
                '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0],
                imgs.shape[-1] if opt.format == 'NCHW' else imgs.shape[1])
            pbar.set_description(s)
            # Plot
            if plots and ni < 3:
                f = save_dir / f'train_batch{ni}.jpg'  # filename
                f = ''
                Thread(target=plot_images, args=(imgs, targets, paths, f, opt.format), daemon=True).start()
                # if tb_writer:
                #     tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                #     tb_writer.add_graph(model, imgs)  # add model to tensorboard
            elif plots and ni == 3 and wandb:
                wandb.log({"Mosaics": [wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg')]})
        # One iteration done; clean up imgs, paths, and targets
        del imgs, targets, paths
    # After each epoch, remember to call gc for a deep cleanup
    gc.collect()
model.save("mask_detector")
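A sanity check I have been running for the "loss not updating" symptom (a sketch only; model, compute_loss, optimizer, imgs, and targets are the names from the code above, and I am assuming a standard tf.keras optimizer):

import numpy as np
import tensorflow as tf

# Sanity check: run one step and verify that gradients flow and weights move.
# model, compute_loss, optimizer, imgs, targets come from the training code above.
before = model.trainable_variables[0].numpy().copy()

with tf.GradientTape() as gt:
    pred = model(imgs, training=True)  # training=True, unlike the plain model(imgs) above
    loss, loss_items = compute_loss(pred, targets, model)

grads = gt.gradient(loss, model.trainable_variables)
missing = [v.name for g, v in zip(grads, model.trainable_variables) if g is None]
print('variables with no gradient:', missing)
print('learning rate:', float(tf.keras.backend.get_value(optimizer.learning_rate)))

optimizer.apply_gradients([(g, v) for g, v in zip(grads, model.trainable_variables) if g is not None])
after = model.trainable_variables[0].numpy()
print('first variable changed:', not np.allclose(before, after))

If any gradient comes back as None, or the first variable does not change after the step, that would explain a frozen loss. One common cause is compute_loss building the loss with NumPy ops, which tf.GradientTape cannot differentiate through; note also that my forward pass calls model(imgs) without training=True.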