@@ -103,8 +103,12 @@ __global__ void train(Memory<Vector, Index> head_embeddings, Memory<Vector, Inde
                 sample_loss += weight * -log(prob + kEpsilon);
             } else {
                 gradient = prob;
-                if (adversarial_temperature > kEpsilon)
+                if (adversarial_temperature > kEpsilon) {
                     weight = safe_exp((logit - bias) / adversarial_temperature) / normalizer;
+                    // the normalizer may be out of date in ASGD
+                    // so we need to clip the weight
+                    weight = min(weight, Float(1));
+                }
                 else
                     weight = 1.0 / num_negative;
                 sample_loss += weight * -log(1 - prob + kEpsilon);
@@ -198,8 +202,12 @@ __global__ void train_1_moment(Memory<Vector, Index> head_embeddings, Memory<Vec
                 sample_loss += weight * -log(prob + kEpsilon);
             } else {
                 gradient = prob;
-                if (adversarial_temperature > kEpsilon)
+                if (adversarial_temperature > kEpsilon) {
                     weight = safe_exp((logit - bias) / adversarial_temperature) / normalizer;
+                    // the normalizer may be out of date in ASGD
+                    // so we need to clip the weight
+                    weight = min(weight, Float(1));
+                }
                 else
                     weight = 1.0 / num_negative;
                 sample_loss += weight * -log(1 - prob + kEpsilon);
@@ -298,8 +306,12 @@ __global__ void train_2_moment(Memory<Vector, Index> head_embeddings, Memory<Vec
                 sample_loss += weight * -log(prob + kEpsilon);
             } else {
                 gradient = prob;
-                if (adversarial_temperature > kEpsilon)
+                if (adversarial_temperature > kEpsilon) {
                     weight = safe_exp((logit - bias) / adversarial_temperature) / normalizer;
+                    // the normalizer may be out of date in ASGD
+                    // so we need to clip the weight
+                    weight = min(weight, Float(1));
+                }
                 else
                     weight = 1.0 / num_negative;
                 sample_loss += weight * -log(1 - prob + kEpsilon);
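The clipped quantity is the self-adversarial negative-sampling weight, `exp((logit - bias) / T) / normalizer`. Under asynchronous SGD the softmax normalizer can be computed from logits that have since moved, so a single weight may exceed 1; clipping bounds each negative sample's loss contribution. Below is a minimal host-side C++ sketch of that failure mode, not repository code: the values and the name `stale_normalizer` are illustrative assumptions.

```cpp
// Minimal illustration (not repository code): under asynchronous SGD the
// softmax normalizer may be computed from stale logits, so a single
// self-adversarial weight exp(logit / T) / normalizer can exceed 1.
#include <algorithm>
#include <cmath>
#include <cstdio>

typedef float Float;

int main() {
    const Float logits[] = {2.0f, 0.5f, -1.0f}; // hypothetical negative-sample logits
    const Float temperature = 0.5f;             // plays the role of adversarial_temperature
    const Float stale_normalizer = 10.0f;       // out-of-date sum of exp(logit / T)
    for (Float logit : logits) {
        Float weight = std::exp(logit / temperature) / stale_normalizer;
        // clip as the patch does, so a stale normalizer cannot
        // inflate one sample's contribution past 1
        weight = std::min(weight, Float(1));
        std::printf("weight = %f\n", weight);
    }
    return 0;
}
```

With these numbers the first weight would be exp(4) / 10 ≈ 5.46 before clipping, which is why the patch caps it at 1 in all three kernels.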