API tutorial¶
Expression Building¶
(This tutorial is tested on DyNet 2.0.4+ and Python 2.7.)
If you find any issues while running, please try to restart the kernel and run it again. :)
[ ]:
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
# Note: please import dynet_config before import dynet
import dynet_config
# set random seed to have the same result each time
dynet_config.set(random_seed=0)
import dynet as dy
import numpy as np
## ==== Create a new computation graph
# (There is a single global computation graph that is used at any point.
# dy.renew_cg() clears the current one and starts a new one)
dy.renew_cg();
Create Expressions¶
Expressions are used as an interface to the various functions that can be used to build DyNet computation graphs.
[ ]:
# create a scalar expression.
value = 5.0
x = dy.scalarInput(value)
[ ]:
# create a vector expression.
dimension = 3
v = dy.vecInput(dimension)
v.set([1,2,3])
[ ]:
# create a matrix expression from a list
mat1 = dy.inputTensor([[1,2], [3,4]]) # Row major
# or, using a numpy array
mat2 = dy.inputTensor(np.array([[1,2], [3,4]]))
mat3 = dy.inputTensor(np.zeros((2,3)))
[ ]:
# Create vector/matrix expressions filled with special values.
# Unlike other toolkits such as TensorFlow or PyTorch, DyNet has a
# special "batch" dimension; see here for more details:
# http://dynet.readthedocs.io/en/latest/minibatch.html
# zeros
dim = 5
batch_size = 3
e = dy.zeros(dim, batch_size=batch_size)
print('zeros of dim {} and batch_size {}:\n{}'.format(dim, batch_size, e.npvalue()))
# ones
e = dy.ones(dim, batch_size=batch_size)
print('ones of dim {} and batch_size {}:\n{}'.format(dim, batch_size, e.npvalue()))
# constant
val = 2
e = dy.constant(dim, val, batch_size=batch_size)
print('constant {} of dim {} and batch_size {}:\n{}'.format(val, dim, batch_size, e.npvalue()))
# random_normal
mean = 0
stddev = 1.0
e = dy.random_normal(dim, mean=mean, stddev=stddev, batch_size=batch_size)
print('A {} dim random_normal of mean {} and stddev {} and batch_size {}:\n{}'.format(dim, mean, stddev, batch_size, e.npvalue()))
# random_bernoulli
p = 0.3 # The p in Bernoulli distribution
scale = 2.0 # Scaling factor to apply to the sampled tensor (default: 1.0)
e = dy.random_bernoulli(dim, p=p, scale=scale, batch_size=batch_size)
print('A {} dim random bernoulli distribution of p={} and scale={}, batch_size={}:\n{}'.format(dim, p, scale, batch_size, e.npvalue()))
# random_uniform
left = -1
right = 1
e = dy.random_uniform(dim, left=left, right=right, batch_size=batch_size)
print('A {} dim random uniform distribution of left={} and right={}, batch_size={}:\n{}'.format(dim, left, right, batch_size, e.npvalue()))
# random_gumbel
# Create a vector distributed according to a Gumbel distribution with the specified parameters.
# (Currently only the defaults of mu=0.0 and beta=1.0 are supported.)
mu = 0.0
beta = 1.0
e = dy.random_gumbel(dim, mu=mu, beta=beta, batch_size=batch_size)
print('A {} dim random gumbel distribution of mu={} and beta={}, batch_size={}:\n{}'.format(dim, mu, beta, batch_size, e.npvalue()))
[ ]:
## ==== Calculate the value of an expression.
# This will run the forward step of the neural network.
print(mat1.value())
print(mat1.npvalue()) # as numpy array
print(v.vec_value()) # as vector, if vector
print(x.scalar_value()) # as scalar, if scalar
print(x.value()) # choose the correct one
Create Parameters¶
Parameters are the things that need to be trained. In contrast to a system like Torch, where computational modules may have their own parameters, in DyNet parameters are just parameters.
[ ]:
# Parameters are things we tune during training.
# Usually a matrix or a vector.
# First we create a parameter collection and add the parameters to it.
m = dy.ParameterCollection()
W = m.add_parameters((8,8)) # an 8x8 matrix, return an expr
b = m.add_parameters(8) # an 8x1 vector, return as expr
Note that in DyNet 2.0.4+, dy.parameters() is deprecated, so explicitly adding parameters to the computation graph is no longer necessary. Any parameter that is used will be added automatically.
[ ]:
# There are several ways to initial parameters
# Specifiying parameter initialization
scale, mean, stddev = 1, 0, 1
# Creates 3x5 matrix filled with 0 (or any other float)
p1 = m.add_parameters((3,5), init=0)
# Creates 3x5 matrix initialized with U([-scale, scale])
p2 = m.add_parameters((3,5), init='uniform', scale=scale)
# Creates 3x5 matrix initialized with N(mean, stddev)
p3 = m.add_parameters((3,5), init='normal', mean=mean, std=stddev)
# Creates 5x5 identity matrix
p4 = m.add_parameters((5,5), init='identity')
# Creates 3x5 matrix with glorot init
p5 = m.add_parameters((3,5), init='glorot')
p6 = m.add_parameters((3,5)) # By default, the init = 'glorot'
# Creates 3x5 matrix with he init
p7 = m.add_parameters((3,5), init='he')
# Creates 3x5 matrix from a numpy array (size is inferred)
p8 = m.add_parameters((3,5), np.ones((3,5)))
Create LookupParameters¶
LookupParameters represent a table of parameters. They are used to embed a set of discrete objects (e.g. word embeddings). They can be sparsely updated.
[ ]:
## ===== Lookup parameters
# Similar to parameters, but are representing a "lookup table" that maps numbers to vectors.
# These are often used for things like word embeddings.
# For example, this will have VOCAB_SIZE rows, each of DIM dimensions.
VOCAB_SIZE = 100
DIM = 10
lp = m.add_lookup_parameters((VOCAB_SIZE, DIM))
[ ]:
# Create expressions from lookup parameters.
e5 = dy.lookup(lp, 5) # create an Expression from row 5.
e5 = lp[5] # same
e5c = dy.lookup(lp, 5, update=False) # as before, but don't update when optimizing.
e45 = dy.lookup_batch(lp, [4, 5]) # create a batched Expression from rows 4 and 5.
e45 = lp.batch([4, 5])
print('e45 dim:', e45.dim())
e0_9 = dy.lookup_batch(lp, range(10)) # create a batched Expression from rows 0 to 9
e0_9 = lp.batch(range(10))
print('e0_9 dim:', e0_9.dim())
e5.set(10) # now the e5 expression contains row 10
print('e5 dim after applying set method', e5.dim())
print(e5.value())
# We can check if it is actually containing row 10
e10 = lp[10]
print(e5.value() == e10.value())
[ ]:
# Similar to Parameters, we have several ways to
# initialize LookupParameters.
scale, mean, stddev = 1, 0, 1
# Creates 3x5 matrix filled with 0 (or any other float)
lp1 = m.add_lookup_parameters((3,5), init=0)
# Creates 3x5 matrix initialized with U([-scale, scale])
lp2 = m.add_lookup_parameters((3,5), init='uniform', scale=scale)
# Creates 3x5 matrix initialized with N(mean, stddev)
lp3 = m.add_lookup_parameters((3,5), init='normal', mean=mean, std=stddev)
# Creates 5x5 identity matrix
lp4 = m.add_lookup_parameters((5,5), init='identity')
# Creates 3x5 matrix with glorot init
lp5 = m.add_lookup_parameters((3,5), init='glorot')
# Must be add_lookup_parameters (not add_parameters) so that lp6 is a
# LookupParameters table like the others in this cell.
lp6 = m.add_lookup_parameters((3,5)) # By default, the init = 'glorot'
# Creates 3x5 matrix with he init
lp7 = m.add_lookup_parameters((3,5), init='he')
# Creates 3x5 matrix from a numpy array (size is inferred)
lp8 = m.add_lookup_parameters((3,5), np.ones((3,5)))
More Expression Manipulation¶
DyNet provides hundreds of operations on Expressions. The user can manipulate Expressions, or build complex Expressions easily.
[ ]:
# First we create some vector Expressions.
# e1 and e2 are reused by most of the cells that follow.
e1 = dy.vecInput(4)
e1.set([1, 2, 3, 4])
e2 = dy.vecInput(4)
e2.set([5, 6, 7, 8])
# Concatenate list of expressions to a single batched expression.
# All input expressions must have the same shape.
e_batch = dy.concatenate_to_batch([e1, e2])
mat1 = dy.inputTensor(np.array([[1, 2], [3, 4], [5, 6], [7, 8]])) # A 4x2 matrix
mat2 = dy.inputTensor(np.array([[1, 0], [0, 1]])) # A 2x2 matrix
[ ]:
# Basic Math Operations
# Add
e = e1 + e2 # Element-wise addition
# Minus
e = e2 - e1 # Element-wise minus
# Negative
e = -e1 # Should be [-1.0, -2.0, -3.0, -4.0]
# Multiply
# `*` is matrix multiplication, not element-wise (use dy.cmult for that).
e = e1 * dy.transpose(e1) # vector times its own transpose (like e1.dot(e1.T) in numpy for a column vector e1)
mat = mat1 * mat2 # (4x2) * (2x2)
# Dot product
e = dy.dot_product(e1, e2) # dot product = sum(component-wise multiply)
# Component-wise multiply
e = dy.cmult(e1, e2)
# Component-wise division
e = dy.cdiv(e1, e2)
# Column-wise addition
# colwise_add(x, y)
# x: An MxN matrix
# y: A length M vector
mat = dy.colwise_add(mat1, e1) # column-wise addition
# Useful math operations
# abs()
e = dy.abs(e1)
# cube()
# Elementwise cubic
e = dy.cube(e1)
# exp()
e = dy.exp(e1)
# pow()
# For each element in e1, calculate e1^{y}
e = dy.pow(e1, dy.inputTensor([2]))
e_ = dy.square(e1)
assert e.value() == e_.value()
# bmin()
# Calculate an output where the ith element is min(x_i, y_i)
e = dy.bmin(e1, e2)
assert e.value() == e1.value()
# bmax()
# Calculate an output where the ith element is max(x_i, y_i)
e = dy.bmax(e1, e2)
assert e.value() == e2.value()
# sin()
e = dy.sin(e1)
# cos()
e = dy.cos(e1)
# tan()
e = dy.tan(e1)
# asin()
e = dy.asin(e1)
# acos()
e = dy.acos(e1)
# atan()
e = dy.atan(e1)
# sinh()
e = dy.sinh(e1)
# cosh()
e = dy.cosh(e1)
# tanh()
e = dy.tanh(e1)
# asinh()
e = dy.asinh(e1)
# acosh()
e = dy.acosh(e1)
# atanh
e = dy.atanh(e1)
# square()
e = dy.square(e1)
# sqrt()
e = dy.sqrt(e1)
[ ]:
# Matrix manipulation
# Reshape
new_dimension = (2, 2)
e = dy.reshape(e1, new_dimension) # Col major
print('reshape a vector:\n', e.value())
# Transpose
e = dy.transpose(e1)
print('e1 dimension:', e1.dim())
print('e1 transpose dimension', e.dim())
# inverse
# Not implemented on GPU yet.
e = dy.inverse(dy.inputTensor([[1, 3], [3, 1]]))
print('Inverse a matrix:\n', e.npvalue())
# logdet
# Not implemented on GPU yet
e = dy.logdet(dy.inputTensor([[1, 0], [0, 2]]))
print('logdet diag(1,2) is log(2):\n', e.npvalue())
# trace_of_product
# Not implemented on GPU yet
diag_12 = dy.inputTensor([[1, 0], [0, 2]])
e = dy.trace_of_product(diag_12, diag_12)
# or on matrix
e = dy.trace_of_product(mat1, mat1)
# circ_conv
sig_1 = dy.inputTensor([1,2,1,0])
sig_2 = dy.inputTensor([0,1,1,1])
e = dy.circ_conv(sig_1, sig_2)
# circ_corr
e = dy.circ_corr(sig_1, sig_2)
[ ]:
# Other Per-element unary functions.
# erf()
# Elementwise calculation of the Gaussian error function
e = dy.erf(e1)
# log()
e = dy.log(e1)
# log_sigmoid()
e = dy.log_sigmoid(e1)
# lgamma()
# Log of the gamma function. The definition of the gamma function can be found here:
# https://en.wikipedia.org/wiki/Gamma_function
e = dy.lgamma(e1)
# gamma(n) = (n-1)!, so for e1 = [1, 2, 3, 4] the expected result is log([1, 1, 2, 6])
e_ = dy.log(dy.inputTensor([1, 1, 2, 6]))
assert e.value() == e_.value()
# sigmoid()
e = dy.logistic(e1) # Sigmoid(x)
# rectify()
# Rectifier (or ReLU, Rectified Linear Unit)
e = dy.rectify(e1) # Relu (= max(x,0))
# elu()
# Exponential Linear Unit (ELU)
# Definition can be found here:
# https://en.wikipedia.org/wiki/Rectifier_(neural_networks)#ELUs
e = dy.elu(e1)
# selu()
# Scaled Exponential Linear Unit (SELU)
# Definition can be found here:
# https://arxiv.org/abs/1706.02515
e = dy.selu(e1)
# silu()
# Sigmoid Linear Unit / sigmoid-weighted linear unit
# SILU / SiL / Swish
# Definition can be found here:
# https://openreview.net/pdf?id=Bk0MRI5lg
e = dy.silu(e1)
# sparsemax()
# **Note:** This function is not yet implemented on GPU.
# Similar to softmax, but induces sparse solutions where
# most of the vector elements are zero.
e = dy.sparsemax(e1)
# softsign()
e = dy.softsign(e1) # x/(1+|x|)
# softmax()
e = dy.softmax(e1)
print('softmax result:', e.value())
# log_softmax
# logsoftmax = logits - log(reduce_sum(exp(logits), dim))
# restrict is a set of indices. if not empty, only entries
# in restrict are part of softmax computation, others get -inf.
e_log_softmax = dy.log_softmax(e1)
e_log_softmax = dy.log_softmax(e1, restrict=[0,1,2])
print('log_softmax result', e_log_softmax.value())
# constrained_softmax()
# **Note:** This function is not yet implemented on GPU.
# similar to softmax, but defines upper bounds for the resulting probabilities.
e = dy.constrained_softmax(e1, dy.inputTensor([0.01, 0.05, 0.10, 0.55]))
print('constrained_softmax result', e.value())
[ ]:
# Picking values from vector expressions
k, v = 1, 3
# Pick one element from a vector or matrix
# similar to python's e1[k] for list.
# k can be negative, which has exactly the same behavior
# as it is in python
e = dy.pick(e1, k)
print('The {} element of vector is {}'.format(k+1, e.value())) # index starts from 0
# which is also equivalent to:
e = e1[k]
# k can be negative. -1 means the last element
e = e1[-1]
print(e.value())
mat = dy.pick(mat1, k)
print('The {} element of matrix mat1 is {}'.format(k+1, mat.value()))
# which is equivalent to:
mat = mat1[k]
# Pick several elements from a vector or matrix
# similar to python's e1[k:v] for lists.
# e1 is an Expression, k, v are integers.
# Important: v should not exceed the e1's dimension.
e = dy.pickrange(e1, k, v)
print('Pick range[k, v) from a vector', e.value())
# which is also equivalent to:
e = e1[k:v]
e = e1[:v] # similar to python, you can neglect k
e = e1[:] # or even both k and v
# ERROR: Don't try this
# e = e1[0:10], the v value should not exceed the dimension.
mat = dy.pickrange(mat1, k, v)
print('Pick range[k, v) from a matrix:\n', mat.value())
# pickneglogsoftmax
# which is equivalent to: dy.pick(-dy.log(dy.softmax(e1)), k)
e = dy.pickneglogsoftmax(e1, k)
e_ = dy.pick(-dy.log(dy.softmax(e1)), k)
# Print BOTH values (fields 0 and 1) so the equivalence can be checked by eye.
print('{0:.6f} and {1:.6f}'.format(e.value(), e_.value()))
# pickneglogsoftmax_batch
# similar to pickneglogsoftmax, this is negative softmax log likelihood on a batch
# The difference is, a list of intergers is required for True classes.
e = dy.pickneglogsoftmax_batch(e1, [k])
[ ]:
# Selecting vectors from matrix Expressions
# select_rows
# works similar to pickrange
e = dy.select_rows(mat1, [0,1])
e_ = mat1[:2]
assert np.all(e.value() == e_.value())
# select_cols
e = dy.select_cols(mat1, [0])
[ ]:
# Expressions concatenation & other useful manipuulations
# This performs an elementwise sum over all the expressions included.
# All expressions should have the same dimension.
e = dy.esum([e1, e2])
# which is equivalent to:
e_ = e1 + e2
assert e.value() == e_.value()
# This performs an elementwise average over all the expressions included.
# All expressions should have the same dimension.
e = dy.average([e1, e2])
# which is equivalent to:
e_ = (e1 + e2)/2
assert e.value() == e_.value()
# Concatenate vectors/matrices column-wise
# All expressions should have the same number of rows.
# e1, e2,.. are column vectors. return a matrix. (sim to np.hstack([e1,e2,...]))
e = dy.concatenate_cols([e1, e2])
print('vector and vector concatenate_cols:\n', e.value())
mat = dy.concatenate_cols([mat1, e2])
print('matrix and vector concatenate_cols:\n', mat.value())
# Concatenate vectors/matrices along the first dimension (row-wise)
# All expressions should have the same number of columns.
# Two length-4 vectors give one length-8 vector; two 2x2 matrices are
# stacked on top of each other. (sim to np.concatenate([e1,e2,...]))
e = dy.concatenate([e1, e2])
print('vector concatenate:', e.value())
mat = dy.concatenate([mat2, mat2])
print('matrix concatenate:\n',mat.value())
# affine transform
e0 = dy.vecInput(2)
e0.set([-1, 0])
e = dy.affine_transform([e1,mat1,e0])
print('affine_transform:', e.value())
# sum_elems
# Sum all elements
e = dy.sum_elems(mat1)
print('sum_elems:', e.value())
# sum_dim
# sum_dim(Expression x, list d, bool b=False, unsigned n=0)
# d (list): Dimensions along which to reduce
# b (bool): Whether to include batch dimension
# Sum along an arbitrary dimension
# Here, mat1 has dimension ((4, 2), 1),
e = dy.sum_dim(mat1, [0])
print('sum over the 0th dimension', e.value())
e = dy.sum_dim(mat1, [1])
print('sum over the 1st dimension', e.value())
# sum_batches
# Sum an expression that consists of multiple minibatches into
# one of equal dimension but with only a single minibatch.
# This is useful for summing loss functions at the end of minibatch training.
e = dy.sum_batches(e1)
# cumsum
# usage: cumsum(Expression x, unsigned d=0)
# Computes the cumulative sum along an arbitrary dimension.
# d (int): Dimension along which to compute the cumulative sums (default: 0)
e = dy.cumsum(mat1, 1)
print('cumsum:\n', e.value())
# mean_elems
# Computes the mean of all the elements of each minibatch.
e = dy.mean_elems(mat1)
print('mean_elems', e.value()) # will return 4.5 in this case.
# mean_dim
# usage: mean_dim(Expression x, list d, bool b, unsigned n=0)
# x (dynet.Expression): Input expression
# d (list): Dimensions along which to reduce
# b (bool): Whether to include batch dimension
# n (int): If > 0, overwrite the n in the equation by this value, useful for masking
# Computes the mean along an arbitrary dimension.
e = dy.mean_dim(mat1, [0], True)
print('mean_dim:', e.value())
e_ = dy.mean_dim(mat1, [1], True)
print('mean_dim:', e_.value())
# mean_batches
# Mean along the batch dimension
e = dy.mean_batches(mat1)
# std_elems
# Computes the standard deviation of all the elements of each minibatch
e = dy.std_elems(mat1)
print('std_elems:', e.value())
# std_dim
# usage: std_dim(Expression x, list d, bool b, unsigned n=0)
# x (dynet.Expression): Input expression
# d (int): Dimensions along which to reduce
# b (bool): Whether to include batch dimension
# n (int): If > 0, overwrite the n in the equation by this value, useful for masking
# Computes the standard deviation along arbitrary dimensions.
e = dy.std_dim(mat1, [0], True)
print('std_dim', e.value())
# std_batches
# Standard deviation along the batch dimension
e = dy.std_batches(mat1)
# moment_elems
# Statistical moment of elements of the tensor
# usage: moment_elems(Expression x, unsigned r)
# x (dynet.Expression): Input expression
# r (int): Moment order
e = dy.moment_elems(mat1, 1)
print('moment_elems:', e.value())
# moment_dim
# Statistical moment along an arbitrary dimension
# usage: moment_dim(Expression x, list d, unsigned r, bool b, unsigned n=0)
# x (dynet.Expression): Input expression
# d (list): Dimensions along which to reduce
# r (int): Moment order
# b (bool): Whether to include batch dimension
# n (int): If > 0, overwrite the n in the equation by this value, useful for masking
e = dy.moment_dim(mat1, [0], 2, False)
print('moment_dim:', e.value())
# moment_batches
# Statistical moment along the batch dimension
e = dy.moment_batches(mat1, 2)
# fold_rows
# usage: fold_rows(Expression x, unsigned nrows=2)
e = dy.fold_rows(mat1)
print('fold_rows:\n', e.value())
DyNet in Neural Networks¶
This part covers the operations that are specific to neural networks.
[ ]:
# Noise and Dropout Expressions
# Add noise to each element, drawn from a Gaussian distribution
# with standard-dev = stddev
stddev = 0.1
e = dy.noise(e1, stddev)
print('noise for stddev=0.1:', e.value())
# Apply dropout to the input expression
# There are two kinds of dropout methods
# (http://cs231n.github.io/neural-networks-2)
# DyNet implements inverted dropout: at training time each element is
# dropped with probability p and the kept ones are scaled by 1/(1-p),
# so nothing needs to be done at test time.
p = 0.5
e = dy.dropout(e1, p) # apply dropout with probability p
# Which elements are dropped is random. With p=0.5 every element is either
# 0 (dropped) or doubled (kept), e.g. [2.0, 4.0, 6.0, 0.0].
print('dropout at probability 0.5:', e.value())
# If we set p=1, everything will be dropped out
e = dy.dropout(e1, 1)
print('dropout at probability 1:', e.value()) # Should be [nan, nan, ...]
# If we set p=0, everything will be kept
e = dy.dropout(e1, 0)
assert e.value() == e1.value()
[ ]:
# Loss Functions
# DyNet provides several ways to calculate "distance"
# between two expressions of the same dimension
# This is square_distance, defined as
# sum(square of(e1-e2)) for all elements
# in e1 and e2.
# Here e1 is a vector of [1,2,3,4]
# And e2 is a vector of [5,6,7,8]
# The square distance is sum((5-1)^2 + (6-2)^2+...)
e = dy.squared_distance(e1, e2)
print('squared_distance:', e.value())
# This is the l1_distance, defined as
# sum (abs(e1-e2)) for all elements in
# e1 and e2.
e = dy.l1_distance(e1, e2)
print('l1_distance:', e.value())
# This is the huber_distance, definition
# found here. (https://en.wikipedia.org/wiki/Huber_loss)
# The default threhold (delta) is 1.345.
# Here e1 is a vector of [1,2,3,4]
# And e2 is a vector of [5,6,7,8]
# because for each pair-wised element in
# e1 and e2, the abs(e1-e2)=4>delta=1.345,
# so the output is sum(delta*(abs(4)-1/2*delta))
e = dy.huber_distance(e1, e2, c=1.345)
print('huber distance:', e.value())
# Binary logistic loss function
# This is similar to cross entropy loss function
# e_scale (the prediction) must be a vector that takes values between 0 and 1
# ty (the target) must be a vector that takes values between 0 and 1
# e = -(ty * log(e_scale) + (1 - ty) * log(1 - e_scale))
ty = dy.vecInput(4)
ty.set([0, 0.5, 0.5, 1])
# e_scale gets its own input node; chaining `e_scale = ty = ...` would
# rebind ty and throw away the targets set above.
e_scale = dy.vecInput(4)
e_scale.set([0.5, 0.5, 0.5, 0.5])
e = dy.binary_log_loss(e_scale, ty)
print('binary_log_loss:', e.value())
# The binary_log_loss is equivalent to the following:
ones = dy.inputTensor([1,1,1,1])
e_equl = -(dy.dot_product(ty, dy.log(e_scale)) + dy.dot_product(ones - ty, dy.log(ones - e_scale)))
# The two results come from different op sequences, so compare with a
# floating-point tolerance rather than exact equality.
assert np.isclose(e_equl.value(), e.value())
# pairwise_rank_loss
# e1 is row vector or scalar
# e2 is row vector or scalar
# m is number
# e = max(0, m - (e1 - e2))
e = dy.pairwise_rank_loss(dy.transpose(e1), dy.transpose(e2), m=1.0) # Row vector needed, so we transpose the vector.
print('pairwise_rank_loss wrt e1:', e.value()) # Expect [[5. 5. 5. 5.]]
e = dy.pairwise_rank_loss(dy.transpose(e2), dy.transpose(e1), m=1.0) # Row vector needed, so we transpose the vector.
print('pairwise_rank_loss wrt e2:', e.value()) # Expect [[0. 0. 0. 0.]]
# poisson_loss
# The negative log probability of y according to a Poisson distribution with parameter x.
# Useful in Poisson regression where, we try to predict the parameters of a Possion
# distribution to maximize the probability of data y.
# usage: poisson_loss(Expression log_lambda, unsigned x)
# log_lambda (dynet.Expression): The log of the Poisson distribution's lambda
# x (int): The target value
e_scalar = dy.scalarInput(2)
e = dy.poisson_loss(e_scalar, 1)
print('poisson_loss:', e.value())
# hinge, aka SVM loss
# usage: hinge(Expression x, unsigned v, float m=1.0)
# x (dynet.Expression): Input scores
# v (int): True class
# m (float): The margin
# here we have e1.value() equals to [1,2,3,4] and the true class is set to be 3 (index 2)
# and we manually set m to be zero.
# so, for each element in e1, we will calculate element-true_class_value+m
# in this case, it is [max(0, 1-3+0), max(0, 2-3+0), max(0, 3-3+0), max(0, 4-3+0)]
# and finally sum it up, which is 1.
e = dy.hinge(e1, 2, m=0)
print('hinge loss:', e.value())
# binary_log_loss
# The log loss of a binary decision according to the sigmoid function.
# Both arguments are vectors with entries between 0 and 1.
e_scale1 = dy.vecInput(3)
e_scale1.set([0.1, 0.2, 0.7])
e_scale2 = dy.vecInput(3)
e_scale2.set([0.5, 0.2, 0.3])
e = dy.binary_log_loss(e_scale1, e_scale2)
[ ]:
# Convolutions
# DyNet can do convolutions similar to PyTorch.
# First we mock an image and a filter
# mat is a 3D tensor of dim{4,4,3}
# kernel is a 4d Tensor of shape {2,2,3,1}
mat = dy.inputTensor(np.array([[[1,2,1], [0,1,2], [0,0,1], [0,1,0]], [[1,0,2], [0,0,0], [1,1,1], [2,2,2]], [[0,1,2], [1,1,0], [0,0,1], [2,2,1]], [[2,2,0], [2,1,2], [2,2,1], [1,1,0]]]))
kernel = dy.inputTensor(np.array([[[[1], [0], [2]], [[1], [2], [0]]], [[[0], [1], [0]], [[2], [1], [1]]]]))
print(mat.dim(), kernel.dim())
# filter1d_narrow()
# usage: e = dy.filter1d_narrow(e1, e2)
# e1: Expression of n x s
# e2: Expression of n x d
# This function will calculate the convolution along each dimension.
# For example, 0.8 = 1*0 + 2*0.1 + 3*0.2 + 0*0.5
# 1.6 = 2*0 + 3*0.1 + 4*0.2 + 1*0.5
# 1.4 = 3*0 + 4*0.1 + 0*0.2 + 2*0.5
e_input = dy.inputTensor([[1,2,3], [2,3,4], [3,4,0], [0,1,2]])
e_filter = dy.inputTensor([[0], [0.1], [0.2], [0.5]])
e = dy.filter1d_narrow(e_input, e_filter)
print('filter1d_narrow', e.npvalue()) # expect [[0.8 1.6 1.4]]
# conv2d
# This is 2D convolution operator without bias parameters.
# dy.conv2d(Expression x, Expression f, vector[unsigned] stride, bool is_valid = True)
# x: The input feature maps: (H x W x Ci) x N (ColMaj), 3D tensor with an optional batch dimension
# f: 2D convolution filters: H x W x Ci x Co (ColMaj), 4D tensor
# stride: the row and column strides in a list
# is_valid: padding method. True for 'Valid' and False for 'Same'.
# 'Valid': output size shrinks by `filter_size - 1`, and the filters always sweep at valid
# positions inside the input maps. No padding needed.
# 'Same': output size is the same as the input size. To do so, one needs to pad the input so
# the filter can sweep outside of the input maps.
e = dy.conv2d(mat, kernel, stride=[1, 1], is_valid=True)
print('conv2d without bias:\n', e.npvalue())
# conv2d_bias
# This is 2D convolution operator with bias parameters.
# dy.conv2d_bias(Expression x, Expression f, Expression b, vector[unsigned] stride, bool is_valid = True)
# b: A vector representing bias. (Ci x 1)
bias = dy.inputTensor([1])
e = dy.conv2d_bias(mat, kernel, bias, stride=[1, 1], is_valid=True)
print('conv2d with bias:\n', e.npvalue())
# maxpooling2d
# Usage: maxpooling2d(Expression x, vector[unsigned] ksize, vector[unsigned] stride, bool is_valid = True)
# x: The input feature maps: (H x W x Ci) x N (ColMaj), 3D tensor with an optional batch dimension
# ksize (list): the max pooling 2d window size
# stride (list): the row and column strides
# Here we use a [2,2] maxpooling window and stride is 1 in rows and 2 in cols.
# Given a ((4, 4, 3), 1) matrix , we expect a (3,2,3) * 1 output
e = dy.maxpooling2d(mat, [2, 2], [1, 2], is_valid=True)
# kmax_pooling
# Usage: kmax_pooling(Expression x, unsigned k, unsigned d=1)
# Select out k maximum values along a given dimension, in the same order as they appear.
# This will result in the size of the given dimension being changed to k.
# x (dynet.Expression): Input expression
# k (int): Number of maximum values to retrieve along the given dimension
# d (int): Dimension on which to perform kmax-pooling (default: 1)
# Given a ((4, 4, 3), 1) matrix:
e = dy.kmax_pooling(mat, 2, 0) # we expect a ((2, 4, 3), 1) output.
e = dy.kmax_pooling(mat, 2, 1) # we expect a ((4, 2, 3), 1) output.
e = dy.kmax_pooling(mat, 2, 2) # we expect a ((4, 4, 2), 1) output.
# kmh_ngram
# usage: kmh_ngram(Expression x, unsigned v)
# x should be an Expression.
# v should be an int and should be less than number of cols
e = dy.kmh_ngram(mat1, 2)
print('kmh_ngram\n',e.npvalue())
[ ]:
# Backpropagation and Gradient related
# nobackprop
# This node has no effect on the forward pass, but prevents gradients from flowing backward during the backward pass.
# This is useful when there's a subgraph for which you don't want loss passed back to the parameters.
e = dy.nobackprop(e1)
# flip_gradient
# This node has no effect on the forward pass, but takes negative on backprop process.
# This operation is widely used in adversarial networks.
e = dy.flip_gradient(e1)
# scale_gradient
# This node scales the gradient by a constant on backprop, with no effect on the forward pass
e = dy.scale_gradient(e1, lambd=2)
# argmax
# This node takes an input vector x and returns a one hot vector y.
# There are two gradient modes for this operation:
# "zero_gradient": is the standard argmax operation. Note that this almost everywhere differentiable and its gradient is 0. **It will stop your gradient**
# "straight_through_gradient": Its forward pass is the same as the argmax operation, but its gradient is the same as the identity function.
# Find more information here at: https://arxiv.org/abs/1308.3432
e = dy.argmax(dy.inputTensor([1,2,3]), gradient_mode='zero_gradient')
[ ]:
# Normalization
# Squared norm
e = dy.squared_norm(e1)
# l2_norm
e = dy.l2_norm(e1)
# layer_norm
# Usage: layer_norm(Expression x, Expression g, Expression b)
# x (dynet.Expression): Input expression (possibly batched)
# g (dynet.Expression): Gain (same dimension as x, no batch dimension)
# b (dynet.Expression): Bias (same dimension as x, no batch dimension)
# details can be found here: https://arxiv.org/abs/1607.06450
# With e1 = [1, 2, 3, 4]:
# mu = average(e1) = 2.5
# delta = sqrt(1/4*sum((e1-mu)^2)) = sqrt(1.25) = 1.118033988749895
# e_norm = g/delta * (x-mu) + b
# (e1 is reused here as input, gain and bias just to keep the example short)
e_norm = dy.layer_norm(e1, e1, e1)
print('layer norm:', e_norm.npvalue())
# weight_norm
# Usage: weight_norm(Expression w, Expression g)
# w (dynet.Expression): Input expression (weight parameter)
# g (dynet.Expression): Gain (scalar expression, usually also a parameter)
e_norm = dy.weight_norm(e1, dy.scalarInput(2))
print('weight norm:', e_norm.npvalue() )
Write your own Neural Networks¶
Now that you have a basic idea about APIs, you can try to write simple Neural Networks of your own.
Here we have some toy models for your reference.
In the first example, we create a simple neural network which can learn to predict the minimal value given a 1 x 3 input vector.
[ ]:
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
import dynet_config
# Declare GPU:0 as the default device type
# dynet_config.set_gpu()
import dynet as dy
# reset the global cg
dy.renew_cg()
# create parameter collection
m = dy.ParameterCollection()
# add parameters to parameter collection
pW = m.add_parameters((10,30))
pB = m.add_parameters(10)
lookup = m.add_lookup_parameters((500, 10))
print("Parameters added.")
# create trainer
trainer = dy.SimpleSGDTrainer(m)
# Regularization is set via the --dynet-l2 commandline flag.
# Learning rate parameters can be passed to the trainer:
# alpha = 0.1 # learning rate
# trainer = dy.SimpleSGDTrainer(m, e0=alpha)
# function for graph creation
def create_network_return_loss(inputs, expected_output):
    """
    Build a fresh computation graph for one example and return its loss.

    inputs is a list of numbers (row indices into `lookup`); their
    concatenated embeddings must match the input size of pW.
    expected_output is the index of the correct class.
    Returns the loss as a DyNet expression.
    """
    dy.renew_cg()
    emb_vectors = [lookup[i] for i in inputs]
    net_input = dy.concatenate(emb_vectors)
    scores = (pW*net_input) + pB
    # Negative log-likelihood of the expected class under softmax(scores).
    # Equivalent to -dy.log(dy.pick(dy.softmax(scores), expected_output)),
    # but done in a single op, which is the numerically safer form.
    loss = dy.pickneglogsoftmax(scores, expected_output)
    return loss
# function for prediction
def create_network_return_best(inputs):
    """
    Return the index of the highest-probability output for `inputs`.

    inputs is a list of numbers (row indices into `lookup`).
    """
    dy.renew_cg()
    net_input = dy.concatenate([lookup[idx] for idx in inputs])
    probabilities = dy.softmax((pW*net_input) + pB)
    return np.argmax(probabilities.npvalue())
# train network: 5 epochs over two (input, label) examples
for epoch in range(5):
    for inp, lbl in (([1,2,3], 1), ([3,2,4], 2)):
        loss = create_network_return_loss(inp, lbl)
        # loss.value() runs the forward pass; it must happen before backward()
        print(loss.value())
        loss.backward()
        trainer.update()

print('Predicted smallest element among {} is {}:'.format([1,2,3], create_network_return_best([1,2,3])))
You can also rewrite this example in a fancier way: create an object which exposes create_network_return_loss()
and create_network_return_best()
as public methods.
[ ]:
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
import dynet_config
# Declare GPU as the default device type
# dynet_config.set_gpu()
import dynet as dy
dy.renew_cg()
# create parameter collection
m = dy.ParameterCollection()
# create a class encapsulating the network
class OurNetwork(object):
    """
    Toy classifier: looks up an embedding for each input id, concatenates
    them, applies an affine layer (pW, pB) and a softmax over 10 classes.
    """

    # The init method adds parameters to the parameter collection.
    def __init__(self, pc):
        """Register the weight, bias and lookup table in collection `pc`."""
        self.pW = pc.add_parameters((10,30))
        self.pB = pc.add_parameters(10)
        self.lookup = pc.add_lookup_parameters((500,10))

    def _scores(self, inputs):
        """Return the unnormalized class scores for a list of input ids.

        Each id selects a 10-dim row of the lookup table, so three ids are
        needed to match the 30 input columns of pW.
        """
        emb_vectors = [self.lookup[i] for i in inputs]
        net_input = dy.concatenate(emb_vectors)
        return (self.pW * net_input) + self.pB

    # the __call__ method applies the network to an input
    def __call__(self, inputs):
        """Return the softmax output expression for `inputs`."""
        return dy.softmax(self._scores(inputs))

    def create_network_return_loss(self, inputs, expected_output):
        """Renew the graph and return the negative log-likelihood of
        class `expected_output` for `inputs`."""
        dy.renew_cg()
        # Same value as -dy.log(dy.pick(self(inputs), expected_output)),
        # computed in one op, which is the numerically safer form.
        return dy.pickneglogsoftmax(self._scores(inputs), expected_output)

    def create_network_return_best(self, inputs):
        """Renew the graph and return the index of the most probable class."""
        dy.renew_cg()
        out = self(inputs)
        return np.argmax(out.npvalue())
# create network
network = OurNetwork(m)
# create trainer
trainer = dy.SimpleSGDTrainer(m)
# train network: 5 epochs over two (input, label) examples
for epoch in range(5):
    for inp, lbl in (([1,2,3], 1), ([3,2,4], 2)):
        loss = network.create_network_return_loss(inp, lbl)
        # loss.value() runs the forward pass; it must happen before backward()
        print(loss.value())
        loss.backward()
        trainer.update()

print('Predicted smallest element among {} is {}:'.format([1,2,3], network.create_network_return_best([1,2,3])))
Or alternatively, have the training outside of the network class.
[ ]:
# create network
network = OurNetwork(m)
# create trainer
trainer = dy.SimpleSGDTrainer(m)
# train network, building the loss outside of the network class
for epoch in range(5):
    for inp, lbl in (([1,2,3], 1), ([3,2,4], 2)):
        # start a new graph for every example
        dy.renew_cg()
        out = network(inp)
        loss = -dy.log(dy.pick(out, lbl))
        # loss.value() runs the forward pass; it must happen before backward()
        print(loss.value())
        loss.backward()
        trainer.update()

print(np.argmax(network([1,2,3]).npvalue()))
[ ]: