%pylab inline
import torch
import sys
sys.path.append('..')
sys.path.append('../..')
from data import load
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('device = ', device)
class ConvNet(torch.nn.Module):
    class Block(torch.nn.Module):
        """Residual block: two 3x3 conv-BN-ReLU stages plus a skip connection."""
        def __init__(self, n_input, n_output, stride=1):
            super().__init__()
            self.net = torch.nn.Sequential(
                torch.nn.Conv2d(n_input, n_output, kernel_size=3, padding=1, stride=stride, bias=False),
                torch.nn.BatchNorm2d(n_output),
                torch.nn.ReLU(),
                torch.nn.Conv2d(n_output, n_output, kernel_size=3, padding=1, bias=False),
                torch.nn.BatchNorm2d(n_output),
                torch.nn.ReLU()
            )
            # 1x1 convolution to match the identity's shape whenever the number
            # of channels or the spatial resolution changes
            self.downsample = None
            if stride != 1 or n_input != n_output:
                self.downsample = torch.nn.Sequential(
                    torch.nn.Conv2d(n_input, n_output, 1, stride=stride),
                    torch.nn.BatchNorm2d(n_output)
                )

        def forward(self, x):
            identity = x
            if self.downsample is not None:
                identity = self.downsample(x)
            return self.net(x) + identity

    def __init__(self, layers=(32, 64, 128), n_input_channels=3):
        super().__init__()
        # Stem: strided 7x7 convolution followed by max pooling, as in ResNet
        L = [
            torch.nn.Conv2d(n_input_channels, 32, kernel_size=7, padding=3, stride=2, bias=False),
            torch.nn.BatchNorm2d(32),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        ]
        c = 32
        for l in layers:
            L.append(self.Block(c, l, stride=2))
            c = l
        self.network = torch.nn.Sequential(*L)
        self.classifier = torch.nn.Linear(c, 1)

    def forward(self, x):
        # Compute the features
        z = self.network(x)
        # Global average pooling over the spatial dimensions
        z = z.mean(dim=[2, 3])
        # Classify: one logit per image
        return self.classifier(z)[:, 0]
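# Quick shape check (illustrative addition, not part of the training code below):
# a batch of four 3x128x128 images should yield exactly one logit per image.
_m = ConvNet()
print(_m(torch.randn(4, 3, 128, 128)).shape)  # torch.Size([4])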
model = ConvNet()
# model.train() / model.eval() toggle the training flag, which changes the
# behaviour of layers such as BatchNorm and Dropout
model.train()
print(model.training)  # True
model.eval()
print(model.training)  # False
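# Sketch of what the flag changes (illustrative): a fresh BatchNorm layer
# normalises with the current batch's statistics in train mode but with its
# running averages in eval mode, so the same input gives different outputs.
_bn = torch.nn.BatchNorm2d(3)
_x = torch.randn(8, 3, 4, 4)
_bn.train()
_out_train = _bn(_x)
_bn.eval()
_out_eval = _bn(_x)
print(torch.allclose(_out_train, _out_eval))  # almost surely False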
n_epochs = 100
train_data = load.get_dogs_and_cats(resize=(128, 128), batch_size=32)
optimizer = torch.optim.Adam(model.parameters())
# Three alternative learning-rate schedules; each assignment would overwrite
# the previous one, so only ReduceLROnPlateau is actually used below.
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
# scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [50, 75], gamma=0.1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max')
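# Sketch (illustrative, on a throwaway optimizer): ReduceLROnPlateau in 'max'
# mode cuts the learning rate by `factor` (default 0.1) once the monitored
# metric stops improving for `patience` consecutive steps.
_opt = torch.optim.Adam([torch.nn.Parameter(torch.zeros(1))], lr=1e-3)
_sched = torch.optim.lr_scheduler.ReduceLROnPlateau(_opt, 'max', patience=2)
for _acc in [0.5, 0.6, 0.6, 0.6, 0.6]:  # the metric plateaus after the 2nd step
    _sched.step(_acc)
print(_opt.param_groups[0]['lr'])  # reduced from 1e-3 to 1e-4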
# Transfer the data to a GPU (optional)
model = model.to(device)
# Construct the loss and accuracy functions
loss = torch.nn.BCEWithLogitsLoss()
accuracy = lambda o, l: ((o > 0).long() == l.long()).float()
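# Illustration: BCEWithLogitsLoss applies the sigmoid internally, so the model
# outputs raw logits and a logit > 0 corresponds to a predicted probability
# above 0.5, which is why the accuracy above thresholds at `o > 0`.
_logits = torch.tensor([-2.0, 0.5, 3.0])
print(torch.sigmoid(_logits))  # tensor([0.1192, 0.6225, 0.9526])
print((_logits > 0).long())    # tensor([0, 1, 1])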
# Train the network
for epoch in range(n_epochs):
    model.train()
    accuracies = []
    for it, (data, label) in enumerate(train_data):
        # Transfer the data to the GPU (optional)
        data, label = data.to(device), label.to(device)
        # Produce the output
        o = model(data)
        # Compute the loss and accuracy
        loss_val = loss(o, label.float())
        accuracies.extend(accuracy(o, label).detach().cpu().numpy())
        # Take a gradient step
        optimizer.zero_grad()
        loss_val.backward()
        optimizer.step()
        # break  # uncomment to stop after a single batch (quick smoke test)
    # StepLR / MultiStepLR take no argument; ReduceLROnPlateau needs the metric
    # scheduler.step()
    scheduler.step(np.mean(accuracies))
    print('epoch =', epoch, 'optimizer_lr', optimizer.param_groups[0]['lr'], 'accuracy', np.mean(accuracies))
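# Evaluation sketch (assumption: no separate validation split is exposed by
# `load.get_dogs_and_cats` here, so this re-uses the training loader; swap in a
# held-out loader if one is available). model.eval() makes BatchNorm use its
# running statistics, and torch.no_grad() skips building the autograd graph.
model.eval()
with torch.no_grad():
    eval_accuracies = []
    for data, label in train_data:
        data, label = data.to(device), label.to(device)
        eval_accuracies.extend(accuracy(model(data), label).cpu().numpy())
print('eval accuracy', np.mean(eval_accuracies))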