API tutorial¶

Expression Building¶

(This tutorial is tested on DyNet 2.0.4+ and Python 2.7.)

If you find any issues while running, please try to restart the kernel and run it again. :)

[ ]:

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

# Note: please import dynet_config before import dynet
import dynet_config
# set random seed to have the same result each time
dynet_config.set(random_seed=0)
import dynet as dy

import numpy as np

## ==== Create a new computation graph
# (There is a single global computation graph that is used at any point.
# dy.renew_cg() clears the current one and starts a new one)
dy.renew_cg();

Create Expressions¶

Expressions are used as an interface to the various functions that can be used to build DyNet computation graphs.

[ ]:

# create a scalar expression.
value = 5.0
x = dy.scalarInput(value)

[ ]:

# create a vector expression.
dimension = 3
v = dy.vecInput(dimension)
v.set([1,2,3])

[ ]:

# create a matrix expression from a list
mat1 = dy.inputTensor([[1,2], [3,4]]) # Row major

# or, using a numpy array
mat2 = dy.inputTensor(np.array([[1,2], [3,4]]))

mat3 = dy.inputTensor(np.zeros((2,3)))

[ ]:

# create a vector/matrix expression of special values
# Unlike other toolkits such as TensorFlow or PyTorch,
# DyNet has a special "batch" dimension; see here for more
# details: http://dynet.readthedocs.io/en/latest/minibatch.html

# zeros
# Build an all-zero expression of size `dim` with `batch_size` batch
# elements and print its numpy value.
dim = 5
batch_size = 3
e = dy.zeros(dim, batch_size=batch_size)
print('zeros of dim {} and batch_size {}:\n{}'.format(dim, batch_size, e.npvalue()))

# ones
e = dy.ones(dim, batch_size=batch_size)
print('ones of dim {} and batch_size {}:\n{}'.format(dim, batch_size, e.npvalue()))

# constant
val = 2
e = dy.constant(dim, val, batch_size=batch_size)
print('constant {} of dim {} and batch_size {}:\n{}'.format(val, dim, batch_size, e.npvalue()))

# random_normal
mean = 0
stddev = 1.0
e = dy.random_normal(dim, mean=mean, stddev=stddev, batch_size=batch_size)
print('A {} dim random_normal of mean {} and stddev {} and batch_size {}:\n{}'.format(dim, mean, stddev, batch_size, e.npvalue()))

# random_bernoulli
p = 0.3 # The p in Bernoulli distribution
scale = 2.0 # Scaling factor to apply to the sampled tensor (default: (1.0))
e = dy.random_bernoulli(dim, p=p, scale=scale, batch_size=batch_size)
print('A {} dim random bernoulli distribution of p={} and scale={}, batch_size={}:\n{}'.format(dim, p, scale, batch_size, e.npvalue()))

# random_uniform
left = -1
right = 1
e = dy.random_uniform(dim, left=left, right=right, batch_size=batch_size)
print('A {} dim random uniform distribution of left={} and right={}, batch_size={}:\n{}'.format(dim, left, right, batch_size, e.npvalue()))

# random_gumbel
# Create a vector distributed according to a Gumbel distribution with the specified parameters.
# (Currently only the defaults of mu=0.0 and beta=1.0 are supported.)
mu = 0.0
beta = 1.0
e = dy.random_gumbel(dim, mu=mu, beta=beta, batch_size=batch_size)
print('A {} dim random gumbel distribution of mu={} and beta={}, batch_size={}:\n{}'.format(dim, mu, beta, batch_size, e.npvalue()))

[ ]:

## ==== Calculate the value of an expression.
# This will run the forward step of the neural network.
print(mat1.value())
print(mat1.npvalue())    # as numpy array
print(v.vec_value())     # as vector, if vector
print(x.scalar_value())  # as scalar, if scalar
print(x.value())         # choose the correct one

Create Parameters¶

Parameters are the things that need to be trained. In contrast to a system like Torch, where computational modules may have their own parameters, in DyNet parameters are just parameters.

[ ]:

# Parameters are things we tune during training.
# Usually a matrix or a vector.

# First we create a parameter collection and add the parameters to it.
m = dy.ParameterCollection()
W = m.add_parameters((8,8)) # an 8x8 matrix, return an expr
b = m.add_parameters(8) # an 8x1 vector, return as expr

Note that in DyNet 2.0.4+, dy.parameters() is deprecated, so explicitly adding parameters to the computation graph is no longer necessary. Any parameter that is used will be added automatically.

[ ]:

# There are several ways to initialize parameters
# Specifying parameter initialization
scale, mean, stddev = 1, 0, 1

# Creates 3x5 matrix filled with 0 (or any other float)
p1 = m.add_parameters((3,5), init=0)
# Creates 3x5 matrix initialized with U([-scale, scale])
p2 = m.add_parameters((3,5), init='uniform', scale=scale)
# Creates 3x5 matrix initialized with N(mean, stddev)
p3 = m.add_parameters((3,5), init='normal', mean=mean, std=stddev)
# Creates 5x5 identity matrix
p4 = m.add_parameters((5,5), init='identity')
# Creates 3x5 matrix with glorot init
p5 = m.add_parameters((3,5), init='glorot')
p6 = m.add_parameters((3,5)) # By default, the init = 'glorot'
# Creates 3x5 matrix with he init
p7 = m.add_parameters((3,5), init='he')
# Creates 3x5 matrix from a numpy array (size is inferred)
p8 = m.add_parameters((3,5), np.ones((3,5)))

Create LookupParameters¶

LookupParameters represent a table of parameters. They are used to embed a set of discrete objects (e.g. word embeddings). They can be sparsely updated.

[ ]:

## ===== Lookup parameters
# Similar to parameters, but are representing a "lookup table" that maps numbers to vectors.
# These are often used for things like word embeddings.
# For example, this will have VOCAB_SIZE rows, each of DIM dimensions.
VOCAB_SIZE = 100
DIM = 10
lp = m.add_lookup_parameters((VOCAB_SIZE, DIM))

[ ]:

# Create expressions from lookup parameters.
e5  = dy.lookup(lp, 5)   # create an Expression from row 5.
e5  = lp[5]              # same
e5c = dy.lookup(lp, 5, update=False)  # as before, but don't update when optimizing.

e45  = dy.lookup_batch(lp, [4, 5])   # create a batched Expression from rows 4 and 5.
e45  = lp.batch([4, 5])
print('e45 dim:', e45.dim())

e0_9 = dy.lookup_batch(lp, range(10))  # create a batched Expression from rows 0 to 9
e0_9 = lp.batch(range(10))
print('e0_9 dim:', e0_9.dim())

e5.set(10)  # now the e5 expression contains row 10
print('e5 dim after applying set method', e5.dim())
print(e5.value())

# We can check if it is actually containing row 10
e10 = lp[10]
print(e5.value() == e10.value())

[ ]:

# Similar to Parameters, we have several ways to
# initialize LookupParameters.
# Every table below is registered through m.add_lookup_parameters so that
# each example really demonstrates a LookupParameters initializer.
scale, mean, stddev = 1, 0, 1

# Creates 3x5 matrix filled with 0 (or any other float)
lp1 = m.add_lookup_parameters((3,5), init=0)
# Creates 3x5 matrix initialized with U([-scale, scale])
lp2 = m.add_lookup_parameters((3,5), init='uniform', scale=scale)
# Creates 3x5 matrix initialized with N(mean, stddev)
lp3 = m.add_lookup_parameters((3,5), init='normal', mean=mean, std=stddev)
# Creates 5x5 identity matrix
lp4 = m.add_lookup_parameters((5,5), init='identity')
# Creates 3x5 matrix with glorot init
lp5 = m.add_lookup_parameters((3,5), init='glorot')
lp6 = m.add_lookup_parameters((3,5)) # By default, the init = 'glorot'
# Creates 3x5 matrix with he init
lp7 = m.add_lookup_parameters((3,5), init='he')
# Creates 3x5 matrix from a numpy array (size is inferred)
lp8 = m.add_lookup_parameters((3,5), np.ones((3,5)))

More Expression Manipulation¶

DyNet provides hundreds of operations on Expressions. The user can manipulate Expressions, or build complex Expressions easily.

[ ]:

# First we create some vector Expressions.
e1 = dy.vecInput(4)
e1.set([1, 2, 3, 4])

e2 = dy.vecInput(4)
e2.set([5, 6, 7, 8])

# Concatenate list of expressions to a single batched expression.
# All input expressions must have the same shape.
e_batch = dy.concatenate_to_batch([e1, e2])

mat1 = dy.inputTensor(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]))  # A 4x2 matrix
mat2 = dy.inputTensor(np.array([[1, 0], [0, 1]]))  # A 2x2 matrix

[ ]:

# Basic Math Operations

# Add
e = e1 + e2  # Element-wise addition

# Minus
e = e2 - e1 # Element-wise minus
# Negative
e = -e1  # Should be [-1.0, -2.0, -3.0, -4.0]

# Multiply
e = e1 * dy.transpose(e1)  #It's Matrix multiplication (like e1.dot(e2) in numpy)

mat = mat1 * mat2

# Dot product
e = dy.dot_product(e1, e2)  # dot product = sum(component-wise multiply)

# Component-wise multiply
e = dy.cmult(e1, e2)

# Component-wise division
e = dy.cdiv(e1, e2)

# Column-wise addition
# colwise_add(x, y)
#  x:  An MxN matrix
#  y:  A length M vector
mat = dy.colwise_add(mat1, e1)  # column-wise addition

# Useful math operations
# abs()
e = dy.abs(e1)

# cube()
# Elementwise cubic
e = dy.cube(e1)

# exp()
e = dy.exp(e1)

# pow()
# For each element in e1, calculate e1^{y}
e = dy.pow(e1, dy.inputTensor([2]))
e_ = dy.square(e1)
assert e.value() == e_.value()

# bmin()
# Calculate an output where the ith element is min(x_i, y_i)
e = dy.bmin(e1, e2)
assert e.value() == e1.value()

# bmax()
# Calculate an output where the ith element is max(x_i, y_i)
e = dy.bmax(e1, e2)
assert e.value() == e2.value()

# sin()
e = dy.sin(e1)

# cos()
e = dy.cos(e1)

# tan()
e = dy.tan(e1)

# asin()
e = dy.asin(e1)

# acos()
e = dy.acos(e1)

# atan()
e = dy.atan(e1)

# sinh()
e = dy.sinh(e1)

# cosh()
e = dy.cosh(e1)

# tanh()
e = dy.tanh(e1)

# asinh()
e = dy.asinh(e1)

# acosh()
e = dy.acosh(e1)

# atanh
e = dy.atanh(e1)

# square()
e = dy.square(e1)

# sqrt()
e = dy.sqrt(e1)

[ ]:

# Matrix manipulation

# Reshape
new_dimension = (2, 2)
e = dy.reshape(e1, new_dimension)  # Col major
print('reshape a vector:\n', e.value())

# Transpose
e = dy.transpose(e1)
print('e1 dimension:', e1.dim())
print('e1 transpose dimension', e.dim())

# inverse
# Not implemented on GPU yet.
e = dy.inverse(dy.inputTensor([[1, 3], [3, 1]]))
print('Inverse a matrix:\n', e.npvalue())

# logdet
# Not implemented on GPU yet
e = dy.logdet(dy.inputTensor([[1, 0], [0, 2]]))
print('logdet diag(1,2) is log(2):\n', e.npvalue())

# trace_of_product
# Not implemented on GPU yet
diag_12 = dy.inputTensor([[1, 0], [0, 2]])
e = dy.trace_of_product(diag_12, diag_12)
# or on matrix
e = dy.trace_of_product(mat1, mat1)

# circ_conv
sig_1 = dy.inputTensor([1,2,1,0])
sig_2 = dy.inputTensor([0,1,1,1])
e = dy.circ_conv(sig_1, sig_2)

# circ_corr
e = dy.circ_corr(sig_1, sig_2)

[ ]:

# Other Per-element unary functions.

# erf()
# Elementwise calculation of the Gaussian error function
e = dy.erf(e1)

# log()
e = dy.log(e1)

# log_sigmoid()
e = dy.log_sigmoid(e1)

# lgamma()
# Definition of gamma function can be found here
# https://en.wikipedia.org/wiki/Gamma_function
e = dy.lgamma(e1)
e_ = dy.log(dy.inputTensor([1, 1, 2, 6]))
assert e.value() == e_.value()

# sigmoid()
e = dy.logistic(e1)   # Sigmoid(x)

# rectify()
# Rectifier (or ReLU, Rectified Linear Unit)
e = dy.rectify(e1)    # Relu (= max(x,0))

# elu()
# Exponential Linear Unit (ELU)
# Definition can be found here:
# https://en.wikipedia.org/wiki/Rectifier_(neural_networks)#ELUs
e = dy.elu(e1)

# selu()
# Scaled Exponential Linear Unit (SELU)
# Definition can be found here:
# https://arxiv.org/abs/1706.02515
e = dy.selu(e1)

# silu()
# Sigmoid Linear Unit / sigmoid-weighted linear unit
# SILU / SiL / Swish
# Definition can be found here:
# https://openreview.net/pdf?id=Bk0MRI5lg
e = dy.silu(e1)

# sparsemax()
# **Note:** This function is not yet implemented on GPU.
# Similar to softmax, but induces sparse solutions where
# most of the vector elements are zero.
e = dy.sparsemax(e1)

# softsign()
e = dy.softsign(e1)    # x/(1+|x|)

# softmax()
e = dy.softmax(e1)
print('softmax result:', e.value())

# log_softmax
# logsoftmax = logits - log(reduce_sum(exp(logits), dim))
# restrict is a set of indices. if not empty, only entries
# in restrict are part of softmax computation, others get -inf.
e_log_softmax = dy.log_softmax(e1)
e_log_softmax = dy.log_softmax(e1, restrict=[0,1,2])
print('log_softmax result', e_log_softmax.value())

# constrained_softmax()
# **Note:** This function is not yet implemented on GPU.
# similar to softmax, but defines upper bounds for the resulting probabilities.
e = dy.constrained_softmax(e1, dy.inputTensor([0.01, 0.05, 0.10, 0.55]))
print('constrained_softmax result', e.value())

[ ]:

# Picking values from vector expressions

k, v = 1, 3
# Pick one element from a vector or matrix
# similar to python's e1[k] for list.
# k can be negative, which has exactly the same behavior
# as it is in python
e = dy.pick(e1, k)
print('The {} element of vector is {}'.format(k+1, e.value())) # index starts from 0
# which is also equivalent to:
e = e1[k]
# k can be negative. -1 means the last element
e = e1[-1]
print(e.value())

mat = dy.pick(mat1, k)
print('The {} element of matrix mat1 is {}'.format(k+1, mat.value()))
# which is equivalent to:
mat = mat1[k]

# Pick several elements from a vector or matrix
# similar to python's e1[k:v] for lists.
# e1 is an Expression, k, v are integers.
# Important: v should not exceed the e1's dimension.
e = dy.pickrange(e1, k, v)
print('Pick range[k, v) from a vector', e.value())
# which is also equivalent to:
e = e1[k:v]
e = e1[:v]  # similar to python, you can neglect k
e = e1[:]   # or even both k and v
# ERROR: Don't try this
# e = e1[0:10], the v value should not exceed the dimension.

mat = dy.pickrange(mat1, k, v)
print('Pick range[k, v) from a matrix:\n', mat.value())

# pickneglogsoftmax
# which is equivalent to: dy.pick(-dy.log(dy.softmax(e1)), k)
e = dy.pickneglogsoftmax(e1, k)
e_ = dy.pick(-dy.log(dy.softmax(e1)), k)
# Print both results side by side: field 0 is the fused op, field 1 is the
# manual pick/log/softmax composition, each with 6 decimal places.
print('{0:.6f} and {1:.6f}'.format(e.value(), e_.value()))

# pickneglogsoftmax_batch
# similar to pickneglogsoftmax, this is negative softmax log likelihood on a batch
# The difference is, a list of integers is required for True classes.
e = dy.pickneglogsoftmax_batch(e1, [k])

[ ]:

# Selecting vectors from matrix Expressions

# select_rows
# works similar to pickrange
e = dy.select_rows(mat1, [0,1])
e_ = mat1[:2]
assert np.all(e.value() == e_.value())

# select_cols
e = dy.select_cols(mat1, [0])

[ ]:

# Expressions concatenation & other useful manipulations

# This performs an elementwise sum over all the expressions included.
# All expressions should have the same dimension.
e = dy.esum([e1, e2])
# which is equivalent to:
e_ = e1 + e2
assert e.value() == e_.value()

# This performs an elementwise average over all the expressions included.
# All expressions should have the same dimension.
e = dy.average([e1, e2])
# which is equivalent to:
e_ = (e1 + e2)/2
assert e.value() == e_.value()

# Concatenate vectors/matrices column-wise
# All expressions should have the same number of rows.
# e1, e2,.. are column vectors. return a matrix. (sim to np.hstack([e1,e2,...]))
e = dy.concatenate_cols([e1, e2])
print('vector and vector concatenate_cols:\n', e.value())

# concatenate_cols also accepts a mix of a matrix and a column vector
# (here the 4x2 mat1 and the length-4 e2).
mat = dy.concatenate_cols([mat1, e2])
print('matrix and vector concatenate_cols:\n', mat.value())

# Concatenate vectors/matrices along the first dimension (rows)
# All expressions should agree in every other dimension.
# e1, e2,.. are column vectors. return a longer column vector. (sim to np.concatenate([e1,e2,...]))
e = dy.concatenate([e1, e2])
print('vector concatenate:', e.value())

mat = dy.concatenate([mat2, mat2])
print('matrix concatenate:\n',mat.value())

# affine transform
e0 = dy.vecInput(2)
e0.set([-1, 0])
e = dy.affine_transform([e1,mat1,e0])
print('affine_transform:', e.value())

# sum_elems
# Sum all elements
e = dy.sum_elems(mat1)
print('sum_elems:', e.value())

# sum_dim
# sum_dim(Expression x, list d, bool b=False, unsigned n=0)
# d (list): Dimensions along which to reduce
# b (bool): Whether to include batch dimension
# Sum along an arbitrary dimension
# Here, mat1 has dimension ((4, 2), 1),
e = dy.sum_dim(mat1, [0])
print('sum over the 0th dimension', e.value())

e = dy.sum_dim(mat1, [1])
print('sum over the 1st dimension', e.value())

# sum_batches
# Sum an expression that consists of multiple minibatches into
# one of equal dimension but with only a single minibatch.
# This is useful for summing loss functions at the end of minibatch training.
e = dy.sum_batches(e1)

# cumsum
# usage: cumsum(Expression x, unsigned d=0)
# Computes the cumulative sum along an arbitrary dimension.
# d (int): Dimension along which to compute the cumulative sums (default: 0)
e = dy.cumsum(mat1, 1)
print('cumsum:\n', e.value())

# mean_elems
# Computes the mean of all the elements of each minibatch.
e = dy.mean_elems(mat1)
print('mean_elems', e.value()) # will return 4.5 in this case.

# mean_dim
# usage:  mean_dim(Expression x, list d, bool b, unsigned n=0)
#         x (dynet.Expression): Input expression
#         d (list): Dimensions along which to reduce
#         b (bool): Whether to include batch dimension
#         n (int): If > 0, overwrite the n in the equation by this value, useful for masking
# Computes the mean along an arbitrary dimension.
e = dy.mean_dim(mat1, [0], True)
print('mean_dim:', e.value())
e_ = dy.mean_dim(mat1, [1], True)
print('mean_dim:', e_.value())

# mean_batches
# Mean along the batch dimension
e = dy.mean_batches(mat1)

# std_elems
# Computes the standard deviation of all the elements of each minibatch
e = dy.std_elems(mat1)
print('std_elems:', e.value())

# std_dim
# usage:  std_dim(Expression x, list d, bool b, unsigned n=0)
#         x (dynet.Expression): Input expression
#         d (int): Dimensions along which to reduce
#         b (bool): Whether to include batch dimension
#         n (int): If > 0, overwrite the n in the equation by this value, useful for masking
# Computes the standard deviation along arbitrary dimensions.
e = dy.std_dim(mat1, [0], True)
print('std_dim', e.value())

# std_batches
# Standard deviation along the batch dimension
e = dy.std_batches(mat1)

# moment_elems
# Statistical moment of elements of the tensor
# usage:  moment_elems(Expression x, unsigned r)
#         x (dynet.Expression): Input expression
#         r (int): Moment order
e = dy.moment_elems(mat1, 1)
print('moment_elems:', e.value())

# moment_dim
# Statistical moment along an arbitrary dimension
# usage:  moment_dim(Expression x, list d, unsigned r, bool b, unsigned n=0)
#             x (dynet.Expression): Input expression
#             d (list): Dimensions along which to reduce
#             r (int): Moment order
#             b (bool): Whether to include batch dimension
#             n (int): If > 0, overwrite the n in the equation by this value, useful for masking
e = dy.moment_dim(mat1, [0], 2, False)
print('moment_dim:', e.value())

# moment_batches
# Statistical moment along the batch dimension
e = dy.moment_batches(mat1, 2)

# fold_rows
# usage: fold_rows(Expression x, unsigned nrows=2)
e = dy.fold_rows(mat1)
print('fold_rows:\n', e.value())

DyNet in Neural Networks¶

This part contains Neural Networks related issues.

[ ]:

# Noise and Dropout Expressions

# Add noise to each element from a Gaussian distribution
# with standard-dev = stddev
stddev = 0.1
e = dy.noise(e1, stddev)
print('noise for stddev=0.1:', e.value())

# Apply dropout to the input expression
# There are two kinds of dropout methods
# (http://cs231n.github.io/neural-networks-2)
# Dynet implement the Inverted dropout where dropout with prob p
# and scaling others by 1/p at training time, and do not need
# to do anything at test time.
p = 0.5
e = dy.dropout(e1, p)    # apply dropout with probability p
print('dropout at probability 0.5:', e.value()) # It should be [2.0, 4.0, 6.0, 0.0], the last element is dropped out and the rest are scaled

# If we set p=1, everything will be dropped out
e = dy.dropout(e1, 1)
print('dropout at probability 1:', e.value()) # Should be [nan, nan, ...]

# If we set p=0, everything will be kept
e = dy.dropout(e1, 0)
assert e.value() == e1.value()

[ ]:

# Loss Functions

# DyNet provides several ways to calculate "distance"
# between two expressions of the same dimension
# This is square_distance, defined as
# sum(square of(e1-e2)) for all elements
# in e1 and e2.
# Here e1 is a vector of [1,2,3,4]
# And e2 is a vector of [5,6,7,8]
# The square distance is sum((5-1)^2 + (6-2)^2+...)
e = dy.squared_distance(e1, e2)
print('squared_distance:', e.value())

# This is the l1_distance, defined as
# sum (abs(e1-e2)) for all elements in
# e1 and e2.
e = dy.l1_distance(e1, e2)
print('l1_distance:', e.value())

# This is the huber_distance, definition
# found here. (https://en.wikipedia.org/wiki/Huber_loss)
# The default threshold (delta) is 1.345.
# Here e1 is a vector of [1,2,3,4]
# And e2 is a vector of [5,6,7,8]
# because for each pair-wised element in
# e1 and e2, the abs(e1-e2)=4>delta=1.345,
# so the output is sum(delta*(abs(4)-1/2*delta))
e = dy.huber_distance(e1, e2, c=1.345)
print('huber distance:', e.value())

# Binary logistic loss function
# This is similar to cross entropy loss function
# The first argument (here e_scale, the prediction) must be a vector that
# takes values between 0 and 1
# ty (the target) must be a vector that takes values between 0 and 1
# e = -(ty * log(e_scale) + (1 - ty) * log(1 - e_scale))
ty = dy.vecInput(4)
ty.set([0, 0.5, 0.5, 1])
# The prediction gets its own input expression. Binding it to the same
# expression as ty would overwrite the targets set above.
e_scale = dy.vecInput(4)
e_scale.set([0.5, 0.5, 0.5, 0.5])
e = dy.binary_log_loss(e_scale, ty)
print('binary_log_loss:', e.value())
# The binary_log_loss is equivalent to the following:
e_equl = -(dy.dot_product(ty, dy.log(e_scale)) + dy.dot_product((dy.inputTensor([1,1,1,1]) - ty), dy.log(dy.inputTensor([1,1,1,1]) - e_scale)))
assert e_equl.value() == e.value()

# pairwise_rank_loss
# e1 is row vector or scalar
# e2 is row vector or scalar
# m is number
# e = max(0, m - (e1 - e2))
e = dy.pairwise_rank_loss(dy.transpose(e1), dy.transpose(e2), m=1.0) # Row vector needed, so we transpose the vector.
print('pairwise_rank_loss wrt e1:', e.value())  # Expect [[5. 5. 5. 5.]]

e = dy.pairwise_rank_loss(dy.transpose(e2), dy.transpose(e1), m=1.0) # Row vector needed, so we transpose the vector.
print('pairwise_rank_loss wrt e2:', e.value())  # Expect [[0. 0. 0. 0.]]

# poisson_loss
# The negative log probability of y according to a Poisson distribution with parameter x.
# Useful in Poisson regression, where we try to predict the parameters of a Poisson
# distribution to maximize the probability of data y.
# usage: poisson_loss(Expression log_lambda, unsigned x)
#        log_lambda (dynet.Expression): The log of the Poisson distribution's lambda
#        x (int): The target value
e_scalar = dy.scalarInput(2)
e = dy.poisson_loss(e_scalar, 1)
print('poisson_loss:', e.value())

# hinge, aka SVM loss
# usage: hinge(Expression x, unsigned v, float m=1.0)
#         x (dynet.Expression): Input scores
#         v (int): True class
#         m (float): The margin
# here we have e1.value() equals to [1,2,3,4] and the true class is set to be 3 (index 2)
# and we manually set m to be zero.
# so, for each element in e1, we will calculate element-true_class_value+m
# in this case, it is [max(0, 1-3+0), max(0, 2-3+0), max(0, 3-3+0), max(0, 4-3+0)]
# and finally sum it up, which is 1.
e = dy.hinge(e1, 2, m=0)
print('hinge loss:', e.value())

# binary_log_loss
# The log loss of a binary decision according to the sigmoid function.
e_scale1 = dy.vecInput(3)
e_scale1.set([0.1, 0.2, 0.7])
e_scale2 = dy.vecInput(3)
e_scale2.set([0.5, 0.2, 0.3])
e = dy.binary_log_loss(e_scale1, e_scale2)

[ ]:

# Convolutions

# DyNet can do convolutions similar to PyTorch.
# First we mock an image and a filter
# mat is a 3D tensor of dim{4,4,3}
# kernel is a 4d Tensor of shape {2,2,3,1}
mat = dy.inputTensor(np.array([[[1,2,1], [0,1,2], [0,0,1], [0,1,0]], [[1,0,2], [0,0,0], [1,1,1], [2,2,2]], [[0,1,2], [1,1,0], [0,0,1], [2,2,1]], [[2,2,0], [2,1,2], [2,2,1], [1,1,0]]]))
kernel = dy.inputTensor(np.array([[[[1], [0], [2]], [[1], [2], [0]]], [[[0], [1], [0]], [[2], [1], [1]]]]))
print(mat.dim(), kernel.dim())

# filter1d_narrow()
# usage: e = dy.filter1d_narrow(e1, e2)
#        e1: Expression of n x s
#        e2: Expression of n x d
# This function will calculate the convolution along each dimension.
# For example, 0.8 = 1*0 + 2*0.1 + 3*0.2 + 0*0.5
#              1.6 = 2*0 + 3*0.1 + 4*0.2 + 1*0.5
#              1.4 = 3*0 + 4*0.1 + 0*0.2 + 2*0.5
e_input = dy.inputTensor([[1,2,3], [2,3,4], [3,4,0], [0,1,2]])
e_filter = dy.inputTensor([[0], [0.1], [0.2], [0.5]])
e = dy.filter1d_narrow(e_input, e_filter)
print('filter1d_narrow', e.npvalue()) # expect [[0.8 1.6 1.4]]


# conv2d
# This is 2D convolution operator without bias parameters.
# dy.conv2d(Expression x, Expression f, vector[unsigned] stride, bool is_valid = True)
# x: The input feature maps: (H x W x Ci) x N (ColMaj), 3D tensor with an optional batch dimension
# f: 2D convolution filters: H x W x Ci x Co (ColMaj), 4D tensor
# stride: the row and column strides in a list
# is_valid: padding method. True for 'Valid' and False for 'Same'.
#     'Valid': output size shrinks by `filter_size - 1`, and the filters always sweep at valid
#              positions inside the input maps. No padding needed.
#     'Same': output size is the same with input size. To do so, one needs to pad the input so
#             the filter can sweep outside of the input maps.
# Convolve the mocked image with the mocked filter, stride 1 along rows and
# columns, using 'Valid' padding, then print the result as a numpy array.
e = dy.conv2d(mat, kernel, stride=[1, 1], is_valid=True)
print('conv2d without bias:\n', e.npvalue())

# conv2d_bias
# This is 2D convolution operator with bias parameters.
# dy.conv2d_bias(Expression x, Expression f, Expression b, vector[unsigned] stride, bool is_valid = True)
# b: A vector representing bias. (Ci x 1)
bias = dy.inputTensor([1])
e = dy.conv2d_bias(mat, kernel, bias, stride=[1, 1], is_valid=True)
print('conv2d with bias:\n', e.npvalue())

# maxpooling2d
# Usage: maxpooling2d(Expression x, vector[unsigned] ksize, vector[unsigned] stride, bool is_valid = True)
# x: The input feature maps: (H x W x Ci) x N (ColMaj), 3D tensor with an optional batch dimension
# ksize (list): the max pooling 2d window size
# stride (list): the row and column strides
# Here we use a [2,2] maxpooling window and stride is 1 in rows and 2 in cols.
# Given a ((4, 4, 3), 1) matrix , we expect a (3,2,3) * 1 output
e = dy.maxpooling2d(mat, [2, 2], [1, 2], is_valid=True)

# kmax_pooling
# Usage: kmax_pooling(Expression x, unsigned k, unsigned d=1)
# Select out k maximum values along a given dimension, in the same order as they appear.
# This will result in the size of the given dimension being changed to k.
#       k (int): Number of maximum values to retrieve along the given dimension
#       d (int): Dimension on which to perform kmax-pooling (default: (1))
# Given a ((4, 4, 3), 1) matrix:
e = dy.kmax_pooling(mat, 2, 0) # we expect a ((2, 4, 3), 1) output.
e = dy.kmax_pooling(mat, 2, 1) # we expect a ((4, 2, 3), 1) output.
e = dy.kmax_pooling(mat, 2, 2) # we expect a ((4, 4, 2), 1) output.

# kmh_ngram
# usage: kmh_ngram(Expression x, unsigned v)
# x should be an Expression.
# v should be an int and should be less than number of cols
e = dy.kmh_ngram(mat1, 2)
print('kmh_ngram\n',e.npvalue())

[ ]:

# Backpropagation and Gradient related

# nobackprop
#  This node has no effect on the forward pass, but prevents gradients from flowing backward during the backward pass.
# This is useful when there's a subgraph for which you don't want loss passed back to the parameters.
e = dy.nobackprop(e1)

# flip_gradient
# This node has no effect on the forward pass, but takes negative on backprop process.
# This operation is widely used in adversarial networks.
e = dy.flip_gradient(e1)

# scale_gradient
# This node scales the gradient by a constant on backprop, with no effect on the forward pass
e = dy.scale_gradient(e1, lambd=2)

# argmax
# This node takes an input vector x and returns a one hot vector y.
# There are two gradient modes for this operation:
# "zero_gradient": is the standard argmax operation. Note that this is almost everywhere differentiable and its gradient is 0. **It will stop your gradient**
# "straight_through_gradient": Its forward pass is the same as the argmax operation, but its gradient is the same as the identity function.
# Find more information here at: https://arxiv.org/abs/1308.3432
e = dy.argmax(dy.inputTensor([1,2,3]), gradient_mode='zero_gradient')

[ ]:

# Normalization

# Squared norm
e = dy.squared_norm(e1)

# l2_norm
e = dy.l2_norm(e1)

# layer_norm
# Usage: layer_norm(Expression x, Expression g, Expression b)
# x (dynet.Expression): Input expression (possibly batched)
# g (dynet.Expression): Gain (same dimension as x, no batch dimension)
# b (dynet.Expression): Bias (same dimension as x, no batch dimension)
# details can be found here: https://arxiv.org/abs/1607.06450
# mu = average(e1) = 2.5
# delta = sqrt(1/4*sum((e1-mu)^2)) = 1.118033988749895
# e_norm = g/delta * (x-mu) + b
e_norm = dy.layer_norm(e1, e1, e1)
print('layer norm:', e_norm.npvalue())

# weight_norm
# Usage: weight_norm(Expression w, Expression g)
#       w (dynet.Expression): Input expression (weight parameter)
#       g (dynet.Expression): Gain (scalar expression, usually also a parameter)
e_norm = dy.weight_norm(e1, dy.scalarInput(2))
print('weight norm:', e_norm.npvalue() )

Write your own Neural Networks¶

Now that you have a basic idea about APIs, you can try to write simple Neural Networks of your own.

Here we have some toy models for your reference.

In the first example, we create a simple Neural Network which could learn to predict the minimal value given a 1 x 3 input vector.

[ ]:

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import dynet_config
# Declare GPU:0 as the default device type
# dynet_config.set_gpu()
import dynet as dy

# reset the global cg
dy.renew_cg()
# create parameter collection
m = dy.ParameterCollection()

# add parameters to parameter collection
pW = m.add_parameters((10,30))
pB = m.add_parameters(10)
lookup = m.add_lookup_parameters((500, 10))
print("Parameters added.")

# create trainer
trainer = dy.SimpleSGDTrainer(m)

# Regularization is set via the --dynet-l2 commandline flag.
# Learning rate parameters can be passed to the trainer:
# alpha = 0.1  # learning rate
# trainer = dy.SimpleSGDTrainer(m, e0=alpha)

# function for graph creation
def create_network_return_loss(inputs, expected_output):
    """
    Start a new computation graph, run the network on `inputs`
    (a list of numbers used as lookup indices) and return the
    negative log of the softmax output picked at `expected_output`.
    """
    dy.renew_cg()
    # Embed every input and stack the embeddings into one input vector.
    embedded = dy.concatenate([lookup[idx] for idx in inputs])
    probs = dy.softmax(pW * embedded + pB)
    picked = dy.pick(probs, expected_output)
    return -dy.log(picked)

# function for prediction
def create_network_return_best(inputs):
    """
    Start a new computation graph, run the network on `inputs`
    (a list of numbers used as lookup indices) and return the
    index of the largest softmax output.
    """
    dy.renew_cg()
    # Embed every input and stack the embeddings into one input vector.
    embedded = dy.concatenate([lookup[idx] for idx in inputs])
    probs = dy.softmax(pW * embedded + pB)
    scores = probs.npvalue()
    return np.argmax(scores)


# train network
for epoch in range(5):
    for inp,lbl in ( ([1,2,3],1), ([3,2,4],2) ):
        loss = create_network_return_loss(inp, lbl)
        print(loss.value()) # need to run loss.value() for the forward prop
        loss.backward()
        trainer.update()

print('Predicted smallest element among {} is {}:'.format([1,2,3], create_network_return_best([1,2,3])))

You can also rewrite this example in a fancier way: create an object which has create_network_return_loss() and create_network_return_best() as public methods.

[ ]:

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import dynet_config
# Declare GPU as the default device type
# dynet_config.set_gpu()
import dynet as dy

dy.renew_cg()
# create parameter collection
m = dy.ParameterCollection()

# create a class encapsulating the network
class OurNetwork(object):
    """Softmax network over concatenated lookup embeddings."""

    def __init__(self, pc):
        """Register the weight matrix, the bias and the lookup table in `pc`."""
        self.pW = pc.add_parameters((10, 30))
        self.pB = pc.add_parameters(10)
        self.lookup = pc.add_lookup_parameters((500, 10))

    def __call__(self, inputs):
        """Apply the network to `inputs` and return the softmax output."""
        embedded = dy.concatenate([self.lookup[idx] for idx in inputs])
        return dy.softmax(self.pW * embedded + self.pB)

    def create_network_return_loss(self, inputs, expected_output):
        """Renew the graph and return -log of the output picked at `expected_output`."""
        dy.renew_cg()
        probs = self(inputs)
        picked = dy.pick(probs, expected_output)
        return -dy.log(picked)

    def create_network_return_best(self, inputs):
        """Renew the graph and return the index of the largest output."""
        dy.renew_cg()
        scores = self(inputs).npvalue()
        return np.argmax(scores)


# create network
network = OurNetwork(m)

# create trainer
trainer = dy.SimpleSGDTrainer(m)

# train network
for epoch in range(5):
    for inp,lbl in ( ([1,2,3],1), ([3,2,4],2) ):
        loss = network.create_network_return_loss(inp, lbl)
        print(loss.value()) # need to run loss.value() for the forward prop
        loss.backward()
        trainer.update()

print('Predicted smallest element among {} is {}:'.format([1,2,3], network.create_network_return_best([1,2,3])))

Or alternatively, have the training outside of the network class.

[ ]:

# create network
network = OurNetwork(m)

# create trainer
trainer = dy.SimpleSGDTrainer(m)

# train network
for epoch in range(5):
    for inp,lbl in ( ([1,2,3],1), ([3,2,4],2) ):
        dy.renew_cg()
        out = network(inp)
        loss = -dy.log(dy.pick(out, lbl))
        print(loss.value()) # need to run loss.value() for the forward prop
        loss.backward()
        trainer.update()

print(np.argmax(network([1,2,3]).npvalue()))

[ ]: