Comment 6 for bug 1533631

Revision history for this message
TJ (tj) wrote : Re: Failed to renew DHCPv6 lease after suspend

Here's some notes from my digging into the code paths being followed. Using the messages logged by NM to syslog I've been able to reconstruct the call heirarchy. In summary, when the state transitions from "expire" to "done":

nm_dhcp_client_set_state () fires a NM_DHCP_STATE_EXPIRE signal

dhcp6_state_changed () receives NM_DHCP_STATE_EXPIRE signal and calls
   dhcp6_fail (self, FALSE)
      dhcp6_cleanup(self, CLEANUP_TYPE_DECONFIGURE, FALSE)
         nm_dhcp_client_stop ()
            NM_DHCP_CLIENT_GET_CLASS (self)->stop (self, release, priv->duid);

I've not been able to identify why the behaviour appears to have changed recently. Inspecting the changelogs and patches in both isc-dhcp and network-manager hasn't revealed anything.

It definitely seems to be Network Manager's issue since it is killing the dhclient process... however... if dhclient tried to gain a new lease immediately it 'deprefered' the expired lease (thus sending NM additional state change signals) then maybe NM wouldn't kill the process.

Something else I noticed - which misled me for a while - is that the dhclient lease files written by Network Manager for IP4v and IPv6 use different syntax for the timestamps. From 'man 5 dhclient.conf' "LEASE DECLARATIONS" the "renew" "rebind" "expire" dates can have two formats: "default" or "local". This is set by "db-time-format" in the configuration.

default: <weekday> <year>/<month>/<day> <hour>:<minute>:<second>
local: epoch <seconds-since-epoch>; # <day-name> <month-name> <day-number> <hours>:<minutes>:<seconds> <year>

The NM IPv4.lease file uses the "default" format (" renew 4 2016/03/31 06:32:42;") but the IPv6.lease file seems to use "local" ("renew 43200;") but without the "epoch" keyword. The code seems to indicate the manual is confusing the *server* settings with client settings, or else IPv6 is using a different scheme entirely via its 'clientenv_add()' mechanism.

NM generates this config file - there is no 'db-time-format':
---
# Created by NetworkManager

send fqdn.fqdn "hephaestion.lan.iam.tj"; # added by NetworkManager
send fqdn.encoded on;
send fqdn.server-update on;

also request dhcp6.name-servers;
also request dhcp6.domain-search;
also request dhcp6.client-id;
---

The dhcp6 lease file extract (showing the last entry is an expired one:-
$ date --date @$(echo 1459002474+86400 | bc)
Sun Mar 27 15:27:54 BST 2016
---
lease6 {
  interface "wlp3s0";
  ia-na 3b:bd:74:85 {
    starts 1459002474;
    renew 43200;
    rebind 69120;
    iaaddr 2a02:8011:2007::2 {
      starts 1459002474;
      preferred-life 86400;
      max-life 86400;
    }
  }
---

---
Notes from the source-code analysis.

      static const GEnumValue values[] = {
        { NM_DHCP_STATE_UNKNOWN, "NM_DHCP_STATE_UNKNOWN", "nm-dhcp-state-unknown" },
        { NM_DHCP_STATE_BOUND, "NM_DHCP_STATE_BOUND", "nm-dhcp-state-bound" },
        { NM_DHCP_STATE_TIMEOUT, "NM_DHCP_STATE_TIMEOUT", "nm-dhcp-state-timeout" },
        { NM_DHCP_STATE_DONE, "NM_DHCP_STATE_DONE", "nm-dhcp-state-done" },
        { NM_DHCP_STATE_EXPIRE, "NM_DHCP_STATE_EXPIRE", "nm-dhcp-state-expire" },
        { NM_DHCP_STATE_FAIL, "NM_DHCP_STATE_FAIL", "nm-dhcp-state-fail" },
        { __NM_DHCP_STATE_MAX, "__NM_DHCP_STATE_MAX", "--nm-dhcp-state-max" },
        { NM_DHCP_STATE_MAX, "NM_DHCP_STATE_MAX", "nm-dhcp-state-max" },
        { 0, NULL, NULL }
      };

client_start ()
\-- g_signal_connect (client, NM_DHCP_CLIENT_SIGNAL_STATE_CHANGED, G_CALLBACK (client_state_changed), self);

client_state_changed ()
\-- if (state >= NM_DHCP_STATE_TIMEOUT)
         remove_client (self, client);
          \-- g_signal_handlers_disconnect_by_func (client, client_state_changed, self);

                /* Stopping the client is left up to the controlling device
                * explicitly since we may want to quit NetworkManager but not terminate
                * the DHCP client.
                */

               g_hash_table_remove (NM_DHCP_MANAGER_GET_PRIVATE (self)->clients, client);

dhcp6_start_with_link_ready ()
\-- priv->dhcp6_client = nm_dhcp_manager_start_ip6
      if (priv->dhcp6_client) {
        priv->dhcp6_state_sigid = g_signal_connect (priv->dhcp6_client,
                                                NM_DHCP_CLIENT_SIGNAL_STATE_CHANGED,
                                                G_CALLBACK (dhcp6_state_changed),
                                                self);
      }

dhcp6_state_changed ()
  switch (state) {
...
   case NM_DHCP_STATE_EXPIRE:
    /* Ignore expiry before we even have a lease (NAK, old lease, etc) */
    if (priv->ip6_state != IP_CONF)
      dhcp6_fail (self, FALSE);
       \-- dhcp6_cleanup(self, CLEANUP_TYPE_DECONFIGURE, FALSE)
                if ( cleanup_type == CLEANUP_TYPE_DECONFIGURE || cleanup_type == CLEANUP_TYPE_REMOVED)
                   nm_dhcp_client_stop ()
                   \-- NM_DHCP_CLIENT_GET_CLASS (self)->stop (self, release, priv->duid);
                        nm_log_info (LOGD_DHCP, "(%s): canceled DHCP transaction, DHCP client pid %d"
             if (priv->dhcp6_mode == NM_RDISC_DHCP_LEVEL_MANAGED) {
                if (timeout || (priv->ip6_state == IP_CONF))
...
                else if (priv->ip6_state == IP_DONE)
                    nm_device_state_changed (self, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_IP_CONFIG_EXPIRED);
...
                             nm_dhcp_client_set_state (self, NM_DHCP_STATE_DONE, NULL, NULL);
    break;
...
   case NM_DHCP_STATE_FAIL:
     dhcp6_fail (self, FALSE);

nm_dhcp_client_set_state ()
\-- if (new_state >= NM_DHCP_STATE_BOUND)
          timeout_cleanup (self);
       if (new_state >= NM_DHCP_STATE_TIMEOUT)
          watch_cleanup (self);
...
       nm_log_info (priv->ipv6 ? LOGD_DHCP6 : LOGD_DHCP4,
               "(%s): DHCPv%c state changed %s -> %s",
               priv->iface,
               priv->ipv6 ? '6' : '4',
               state_to_string (priv->state),
               state_to_string (new_state));

       priv->state = new_state;
       g_signal_emit (G_OBJECT (self),
                 signals[SIGNAL_STATE_CHANGED], 0,
                 new_state,
                 ip_config,
                 options);

nm_device_state_changed (self, state, reason)
    _set_state_full (self, state, reason, FALSE); # _set_state_full (self, state, reason, quitting)
    \-- switch (state)
...
             case NM_DEVICE_STATE_ACTIVATED:
                _LOGI (LOGD_DEVICE, "Activation: successful, device activated.");
                nm_dispatcher_call (DISPATCHER_ACTION_UP, nm_act_request_get_connection (req), self, NULL, NULL, NULL);
                break;
             case NM_DEVICE_STATE_FAILED:
                if (nm_device_uses_assumed_connection (self)) {
                    /* Avoid tearing down assumed connection, assume it's connected */
                    nm_device_queue_state (self,
                             NM_DEVICE_STATE_ACTIVATED,
                             NM_DEVICE_STATE_REASON_CONNECTION_ASSUMED);
                    break;
                }