author | Hung Bui <86261282+hungbui0411@users.noreply.github.com> | 2022-11-09 10:30:56 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-11-09 10:30:56 +0300 |
commit | d0a729801412372cb553a3328010675f404a1dca (patch) | |
tree | 93c70874178bae4fb88ffd738f771c83ddb6eec0 | |
parent | eb46dda76a275d66ef5a23a2de35a7f4e84d6fbb (diff) | |
Addition of extra nonlinearities for experiments (#1149)
* Add extra nonlinearities to experiment with
* Add notes on scores for the various nonlinearities
* Isolate the import of nonlinearities that were not available until after torch 1.3.0 (see the availability check sketched below)
Co-authored-by: Hung Manh Bui <hung0411@sc.stanford.edu>
Co-authored-by: John Bauer <horatio@gmail.com>
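The torch compatibility isolation mentioned above works by feature detection rather than version comparison: a newer activation class is registered only if the installed torch actually exposes it. Below is a minimal sketch for checking that availability locally, assuming only that torch is importable; the five guarded class names are taken from the diff, while the printout format is illustrative.

```python
# Report which of the version-guarded activation classes the locally
# installed torch provides; on a sufficiently recent torch (>= 1.9,
# which added nn.Mish) all five should print True.
import torch
import torch.nn as nn

print("torch", torch.__version__)
for name in ['GLU', 'Hardsigmoid', 'Hardswish', 'Mish', 'SiLU']:
    print("%-12s %s" % (name, hasattr(nn, name)))
```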
-rw-r--r-- | stanza/models/constituency/utils.py | 64 |
1 file changed, 58 insertions(+), 6 deletions(-)
```diff
diff --git a/stanza/models/constituency/utils.py b/stanza/models/constituency/utils.py
index e3bfa525..4f46abea 100644
--- a/stanza/models/constituency/utils.py
+++ b/stanza/models/constituency/utils.py
@@ -105,21 +105,73 @@ def retag_trees(trees, pipeline, xpos=True):
             raise ValueError("Failed to properly retag tree #{}: {}".format(tree_idx, tree)) from e
     return new_trees
 
+
+# experimental results on nonlinearities
+# this is on a VI dataset, VLSP_22, using 1/10th of the data as a dev set
+# (no released test set at the time of the experiment)
+# original non-Bert tagger, with 1 iteration each instead of averaged over 5
+# considering the number of experiments and the length of time they would take
+#
+# Gelu has the highest score, which tracks with other experiments run.
+# Note that publicly released models have typically used Relu
+# on account of the runtime speed improvement
+#
+# Gelu:        82.32
+# Relu:        82.14
+# Mish:        81.95
+# Relu6:       81.91
+# Silu:        81.90
+# ELU:         81.73
+# Hardswish:   81.67
+# Softsign:    81.63
+# Hardtanh:    81.44
+# Celu:        81.43
+# Selu:        81.17
+# TODO: need to redo the prelu experiment with
+#   possibly different numbers of parameters
+#   and proper weight decay
+# Prelu:       80.95 (terminated early)
+# Softplus:    80.94
+# Logsigmoid:  80.91
+# Hardsigmoid: 79.03
+# RReLU:       77.00
+# Hardshrink:  failed
+# Softshrink:  failed
 NONLINEARITY = {
-    'tanh': nn.Tanh,
-    'relu': nn.ReLU,
+    'celu': nn.CELU,
+    'elu': nn.ELU,
     'gelu': nn.GELU,
+    'hardshrink': nn.Hardshrink,
+    'hardtanh': nn.Hardtanh,
     'leaky_relu': nn.LeakyReLU,
+    'logsigmoid': nn.LogSigmoid,
+    'prelu': nn.PReLU,
+    'relu': nn.ReLU,
+    'relu6': nn.ReLU6,
+    'rrelu': nn.RReLU,
+    'selu': nn.SELU,
+    'softplus': nn.Softplus,
+    'softshrink': nn.Softshrink,
+    'softsign': nn.Softsign,
+    'tanhshrink': nn.Tanhshrink,
+    'tanh': nn.Tanh,
 }
 
 # separating these out allows for backwards compatibility with earlier versions of pytorch
 # NOTE torch compatibility: if we ever *release* models with these
 # activation functions, we will need to break that compatibility
-if hasattr(nn, 'SiLU'):
-    NONLINEARITY['silu'] = nn.SiLU
-if hasattr(nn, 'Mish'):
-    NONLINEARITY['mish'] = nn.Mish
+nonlinearity_list = [
+    'GLU',
+    'Hardsigmoid',
+    'Hardswish',
+    'Mish',
+    'SiLU',
+]
+
+for nonlinearity in nonlinearity_list:
+    if hasattr(nn, nonlinearity):
+        NONLINEARITY[nonlinearity.lower()] = getattr(nn, nonlinearity)
 
 def build_nonlinearity(nonlinearity):
     """
```
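The hunk is cut off at the opening of build_nonlinearity, so the following is only a sketch of how the NONLINEARITY table is plausibly consumed: look up the module class by name and instantiate it. The error message and the Sequential usage are illustrative, not taken from the commit.

```python
# Hypothetical sketch of consuming the NONLINEARITY table; the real
# build_nonlinearity body is truncated in the diff above, so this
# lookup-and-instantiate logic is an assumption, not the committed code.
import torch.nn as nn

NONLINEARITY = {
    'gelu': nn.GELU,
    'relu': nn.ReLU,
    'tanh': nn.Tanh,
}

def build_nonlinearity(nonlinearity):
    """Instantiate the activation module registered under the given name."""
    if nonlinearity not in NONLINEARITY:
        raise ValueError("Unknown nonlinearity: %s" % nonlinearity)
    return NONLINEARITY[nonlinearity]()

# for example, choosing the activation from a command line flag
layer = nn.Sequential(nn.Linear(128, 128), build_nonlinearity('gelu'))
```

Keeping the table as a name-to-class mapping (instantiated at call time) rather than a name-to-instance mapping means each caller gets a fresh module, which matters for parameterized activations such as nn.PReLU.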