Source code for models.moe_adapters_utils.adapter
# --------------------------------------------------------
# References:
# https://github.com/jxhe/unify-parameter-efficient-tuning
# --------------------------------------------------------
import math
import torch
import torch.nn as nn
class Adapter(nn.Module):
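    """Bottleneck adapter: down-project, non-linearity, then up-project.

    Reduces ``d_model`` features to ``bottleneck`` dimensions, applies ReLU and
    dropout, projects back to ``d_model``, scales the result, and optionally adds
    a residual connection, with a LayerNorm on the input ("in") or output ("out").
    """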
def __init__(self,
d_model=None,
bottleneck=None,
dropout=0.0,
init_option="lora",
adapter_scalar="1.0",
adapter_layernorm_option="in"):
super().__init__()
        self.n_embd = d_model
self.down_size = bottleneck
        # Optional LayerNorm applied before ("in") or after ("out") the bottleneck
self.adapter_layernorm_option = adapter_layernorm_option
self.adapter_layer_norm_before = None
if adapter_layernorm_option == "in" or adapter_layernorm_option == "out":
self.adapter_layer_norm_before = nn.LayerNorm(self.n_embd)
if adapter_scalar == "learnable_scalar":
self.scale = nn.Parameter(torch.ones(1))
else:
self.scale = float(adapter_scalar)
        self.down_proj = nn.Linear(self.n_embd, self.down_size)
self.non_linear_func = nn.ReLU()
self.up_proj = nn.Linear(self.down_size, self.n_embd)
self.dropout = dropout
if init_option == "bert":
raise NotImplementedError
elif init_option == "lora":
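            # LoRA-style initialization: Kaiming-uniform down projection, zero
            # up projection and biases, so the adapter starts as a near no-op.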
with torch.no_grad():
nn.init.kaiming_uniform_(self.down_proj.weight, a=math.sqrt(5))
nn.init.zeros_(self.up_proj.weight)
nn.init.zeros_(self.down_proj.bias)
nn.init.zeros_(self.up_proj.bias)
def forward(self, x, add_residual=True, residual=None):
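        """Apply the adapter to ``x``; when ``add_residual`` is True, ``residual``
        (defaulting to ``x``) is added to the scaled adapter output."""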
residual = x if residual is None else residual
        if self.adapter_layernorm_option == 'in':  # LayerNorm on the adapter input
x = self.adapter_layer_norm_before(x)
down = self.down_proj(x)
down = self.non_linear_func(down)
down = nn.functional.dropout(down, p=self.dropout, training=self.training)
up = self.up_proj(down)
up = up * self.scale
        if self.adapter_layernorm_option == 'out':  # LayerNorm on the adapter output
up = self.adapter_layer_norm_before(up)
if add_residual:
output = up + residual
else:
output = up
return output
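
# Minimal usage sketch (illustrative, not part of the original module): the sizes
# below and the names `adapter`, `example_input`, and `output` are assumed values
# chosen for demonstration only.
if __name__ == "__main__":
    adapter = Adapter(d_model=768, bottleneck=64, dropout=0.1,
                      adapter_scalar="learnable_scalar",
                      adapter_layernorm_option="in")
    example_input = torch.randn(2, 16, 768)             # (batch, seq_len, d_model)
    output = adapter(example_input, add_residual=True)  # same shape as the input
    print(output.shape)                                 # torch.Size([2, 16, 768])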