MSE (Mean Squared Error)
Cross Entropy Loss
Entropy:
In 1948, Shannon extended the concept of entropy from statistical physics to channel communication, thereby founding the discipline of information theory. He called the average amount of information left in a message after redundancy has been removed its "information entropy". Shannon's "entropy" is therefore also known as Shannon entropy or information entropy:

H = -K Σ_i P(x_i) log P(x_i)

where x_i ranges over all possible samples of the probability space, P(x_i) is the probability that the sample occurs, and K is an arbitrary constant that depends on the choice of units.
For this problem, the larger the entropy, the greater the uncertainty; for a discussion of the information content involved, see Zhihu.
In information theory, a large amount of information means the data is spread over a small range and the uncertainty is small. Shannon, as an information theorist, was concerned with the correct transmission of information, so information entropy measures the uncertainty of the transmission. The value computed with Shannon's formula is therefore called the information entropy in information theory, while in the entropy weight method it is called the redundancy or deviation value. Its original meaning is to measure how far the information sent by a single, unambiguous source has deviated from the original exact information after being disturbed. The larger the dispersion, the smaller this value, the less reliable the received information, and the less information is obtained; the larger this value, the more reliable the received information, and the more information is obtained.
In statistics the picture is completely different. Statisticians do not assume that there is a single, unquestionable original message; rather, the received statistics are all taken to be certain, and it is only because they may come from many senders, or from the same sender at different times or places, or through different statistical channels, that a set of dispersed values is obtained. In this case, the larger the dispersion, the smaller the entropy value, the larger the amount of information, and therefore the larger the weight.
import torch
a = torch.full([4], 1/4)
# tensor([0.2500, 0.2500, 0.2500, 0.2500])
# compute the entropy (in bits) of this uniform distribution
-(a * torch.log2(a)).sum()
# tensor(2.)
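As a quick illustration of the point above, the more peaked (i.e. more certain) the distribution, the lower its entropy; the printed values are approximate:
b = torch.tensor([0.1, 0.1, 0.1, 0.7])
-(b * torch.log2(b)).sum()
# tensor(1.3568)
c = torch.tensor([0.001, 0.001, 0.001, 0.997])
-(c * torch.log2(c)).sum()
# tensor(0.0342)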
When cross entropy is used as a loss function in a neural network, p is the distribution of the true labels and q is the label distribution predicted by the trained model; the cross-entropy loss measures the similarity between p and q. Another advantage of using cross entropy as the loss function is that, with a sigmoid activation, gradient descent avoids the learning slowdown that the mean-squared-error loss suffers from, because the learning speed is then controlled by the output error.
Cross entropy: H(p, q) = -Σ_x p(x) log q(x)
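A minimal sketch of how PyTorch computes this quantity when p is a one-hot label distribution: F.cross_entropy expects raw logits and integer class indices, and applies log_softmax internally, so it matches the manual H(p, q) below.
import torch
import torch.nn.functional as F

logits = torch.randn(1, 10)      # unnormalized scores for one sample
target = torch.tensor([3])       # true class index (one-hot p)

# built-in: log_softmax + negative log-likelihood in one call
loss = F.cross_entropy(logits, target)

# manual H(p, q): only the term of the true class survives the sum
log_q = F.log_softmax(logits, dim=1)
manual = -log_q[0, target.item()]

print(loss, manual)              # the two values agree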
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
batch_size=200
learning_rate=0.01
epochs=10
# Load the dataset with DataLoader(dataset, batch_size, shuffle=whether to shuffle each epoch, num_workers=4: 4 worker processes)
# torchvision.datasets.MNIST(root, train, transform, download): root is where the data is stored, train selects the training set, transform is applied to each image before it is returned, download controls whether to download
# transforms.Compose([transforms.ToTensor(), transforms.Normalize((mean,), (std,))])
# transforms.ToTensor() does three things: 1. scales pixel values to [0, 1] by dividing by 255  2. converts the dtype to torch.FloatTensor  3. reshapes (H, W, C) -> (C, H, W)
# transforms.Normalize((mean,), (std,)): normalizes the tensor image with the given mean and standard deviation
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])),
    batch_size=batch_size, shuffle=True)
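As a quick sanity check (added here for illustration), one batch from the loader has the expected shapes:
data, target = next(iter(train_loader))
print(data.shape)    # torch.Size([200, 1, 28, 28])
print(target.shape)  # torch.Size([200])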
# weights are stored as (out_features, in_features), so the forward pass uses x @ w.t()
w1, b1 = torch.randn(200, 784, requires_grad=True),\
         torch.zeros(200, requires_grad=True)
w2, b2 = torch.randn(200, 200, requires_grad=True),\
         torch.zeros(200, requires_grad=True)
w3, b3 = torch.randn(10, 200, requires_grad=True),\
         torch.zeros(10, requires_grad=True)
# Kaiming (He) initialization, suited to ReLU layers, applied in place
torch.nn.init.kaiming_normal_(w1)
torch.nn.init.kaiming_normal_(w2)
torch.nn.init.kaiming_normal_(w3)
def forward(x):
    x = x @ w1.t() + b1
    x = F.relu(x)
    x = x @ w2.t() + b2
    x = F.relu(x)
    x = x @ w3.t() + b3
    x = F.relu(x)
    return x
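A quick shape check of this manual forward pass (added for illustration; it assumes the weights defined above):
out = forward(torch.randn(2, 784))
print(out.shape)  # torch.Size([2, 10])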
optimizer = optim.SGD([w1, b1, w2, b2, w3, b3], lr=learning_rate)
criteon = nn.CrossEntropyLoss()  # expects raw logits and integer class labels; softmax is applied internally
for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.view(-1, 28*28)
        logits = forward(data)
        # print(data.shape, target.shape, logits.shape)
        loss = criteon(logits, target)
        optimizer.zero_grad()
        loss.backward()
        # print(w1.grad.norm(), w2.grad.norm())
        optimizer.step()
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    test_loss = 0
    correct = 0
    for data, target in test_loader:
        data = data.view(-1, 28 * 28)
        logits = forward(data)
        test_loss += criteon(logits, target).item()
        pred = logits.data.max(1)[1]
        # print(pred)
        correct += pred.eq(target.data).sum()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
import torch
import torch.nn as nn
import torch.nn.functional as F
x=torch.randn(1,784)
x.shape
#torch.Size([1, 784])
# nn.Linear(in_features, out_features)
layer1 = nn.Linear(784,200)
layer2 = nn.Linear(200,200)
layer3 = nn.Linear(200,10)
x=layer1(x)
x=F.relu(x,inplace=True)
x.shape
#torch.Size([1, 200])
x=layer2(x)
x=F.relu(x,inplace=True)
x.shape
#torch.Size([1, 200])
x=layer3(x)
x=F.relu(x,inplace=True)
x.shape
#torch.Size([1, 10])
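Internally nn.Linear stores its weight as (out_features, in_features), matching the manual x @ w.t() layout used above; a small check, added for illustration:
print(layer1.weight.shape)  # torch.Size([200, 784])
print(layer1.bias.shape)    # torch.Size([200])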
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(784, 200),
            nn.ReLU(inplace=True),
            nn.Linear(200, 200),
            nn.ReLU(inplace=True),
            nn.Linear(200, 10),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        x = self.model(x)
        return x
net = MLP()
optimizer = optim.SGD(net.parameters(), lr=learning_rate)
criteon = nn.CrossEntropyLoss()
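A minimal usage sketch (added for illustration): the module maps a flattened 28x28 image to 10 class scores, and net.parameters() gathers all trainable tensors for the optimizer.
out = net(torch.randn(2, 784))
print(out.shape)                                  # torch.Size([2, 10])
print(sum(p.numel() for p in net.parameters()))   # 199210 trainable parameters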
Other activation functions: SELU, Softplus, ...
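A brief sketch of these activations via torch.nn.functional; LeakyReLU, used in the GPU version below, is included for comparison:
import torch
import torch.nn.functional as F

x = torch.linspace(-2, 2, 5)
print(F.selu(x))        # scaled exponential linear unit
print(F.softplus(x))    # smooth approximation of ReLU: log(1 + exp(x))
print(F.leaky_relu(x))  # ReLU with a small negative slope (default 0.01)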
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
batch_size=200
learning_rate=0.01
epochs=10
# Load the dataset with DataLoader(dataset, batch_size, shuffle=whether to shuffle each epoch, num_workers=4: 4 worker processes)
# torchvision.datasets.MNIST(root, train, transform, download): root is where the data is stored, train selects the training set, transform is applied to each image before it is returned, download controls whether to download
# transforms.Compose([transforms.ToTensor(), transforms.Normalize((mean,), (std,))])
# transforms.ToTensor() does three things: 1. scales pixel values to [0, 1] by dividing by 255  2. converts the dtype to torch.FloatTensor  3. reshapes (H, W, C) -> (C, H, W)
# transforms.Normalize((mean,), (std,)): normalizes the tensor image with the given mean and standard deviation
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])),
    batch_size=batch_size, shuffle=True)
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(784, 200),
            nn.LeakyReLU(inplace=True),
            nn.Linear(200, 200),
            nn.LeakyReLU(inplace=True),
            nn.Linear(200, 10),
            nn.LeakyReLU(inplace=True),
        )

    def forward(self, x):
        x = self.model(x)
        return x
## Key point: move the network, the loss, and every batch to the GPU
device=torch.device('cuda:0')
net = MLP().to(device)
optimizer = optim.SGD(net.parameters(),lr=learning_rate)
criteon = nn.CrossEntropyLoss().to(device)
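If a GPU may not be available, a common guard (added as a side note) is to fall back to the CPU when selecting the device:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')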
for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.view(-1, 28*28)
        data, target = data.to(device), target.to(device)
        logits = net(data)
        # print(data.shape, target.shape, logits.shape)
        loss = criteon(logits, target)
        optimizer.zero_grad()
        loss.backward()
        # print(w1.grad.norm(), w2.grad.norm())
        optimizer.step()
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    test_loss = 0
    correct = 0
    for data, target in test_loader:
        data = data.view(-1, 28 * 28)
        data, target = data.to(device), target.to(device)
        logits = net(data)
        test_loss += criteon(logits, target).item()
        pred = logits.data.max(1)[1]
        # print(pred)
        correct += pred.eq(target.data).sum()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))