commit c2d0bf30ea0718f0f415af6b6b7dfc232b6e4706
Author: Radosław Piliszek <radoslaw.piliszek@gmail.com>
Date:   Mon Apr 27 10:40:15 2020 +0200

    Coordinate haproxy and keepalived restarts
    
    Keepalived and haproxy cooperate to provide control plane HA in
    kolla-ansible deployments.
    Certain care should be exerted to avoid prolonged availability
    loss during reconfigurations and upgrades.
    This patch aims to provide this care.
    There is nothing special about keepalived upgrade compared to
    reconfig, hence it is simplified to run the same code as for
    deploy.
    The broken logic of safe upgrade is replaced by common handler
    code which's goal is to ensure we down current master only after
    we have backups ready.
    
    This change introduces a switch to kolla_docker module that allows
    to ignore missing containers (as they are logically stopped).
    ignore_missing is the switch's name.
    All tests are included.
    
    Change-Id: I22ddec5f7ee4a7d3d502649a158a7e005fe29c48

diff --git a/ansible/library/kolla_docker.py b/ansible/library/kolla_docker.py
index edead35..224c424 100644
--- a/ansible/library/kolla_docker.py
+++ b/ansible/library/kolla_docker.py
@@ -936,8 +936,10 @@ class DockerWorker(object):
             graceful_timeout = 10
         container = self.check_container()
         if not container:
-            self.module.fail_json(
-                msg="No such container: {} to stop".format(name))
+            ignore_missing = self.params.get('ignore_missing')
+            if not ignore_missing:
+                self.module.fail_json(
+                    msg="No such container: {} to stop".format(name))
         elif not container['Status'].startswith('Exited '):
             self.changed = True
             self.dc.stop(name, timeout=graceful_timeout)
@@ -1069,6 +1071,7 @@ def generate_module():
         dimensions=dict(required=False, type='dict', default=dict()),
         tty=dict(required=False, type='bool', default=False),
         client_timeout=dict(required=False, type='int', default=120),
+        ignore_missing=dict(required=False, type='bool', default=False),
     )
     required_if = [
         ['action', 'pull_image', ['image']],
diff --git a/ansible/roles/haproxy/handlers/main.yml b/ansible/roles/haproxy/handlers/main.yml
index 60fa5a0..ca94f34 100644
--- a/ansible/roles/haproxy/handlers/main.yml
+++ b/ansible/roles/haproxy/handlers/main.yml
@@ -1,5 +1,51 @@
 ---
-- name: Restart haproxy container
+# NOTE(yoctozepto): this handler dance is to ensure we delay restarting master
+# keepalived and haproxy which control VIP address until we have working backups.
+# This could be improved by checking if backup keepalived do not report FAULT state.
+# Master node is handled specially to let it close down connections and only then
+# drop the VIP address by stopping keepalived service.
+
+# NOTE(yoctozepto): we need fresh VIP address placement info (facts may be old)
+- name: Check IP addresses on the API interface
+  vars:
+    version: "{{ '6' if api_address_family == 'ipv6' else '4' }}"
+  become: true
+  command: ip -{{ version }} -o addr show dev {{ api_interface }}
+  register: ip_addr_output
+  changed_when: false
+  when:
+    - kolla_action != "config"
+  listen:
+    - Restart haproxy container
+    - Restart keepalived container
+
+- name: Group HA nodes by status
+  vars:
+    re_safe_address: "{{ kolla_internal_vip_address | regex_escape }}"
+  group_by:
+    key: kolla_ha_is_master_{{ ip_addr_output.stdout is regex('\b' + re_safe_address + '\b') }}
+  when:
+    - kolla_action != "config"
+  listen:
+    - Restart haproxy container
+    - Restart keepalived container
+
+- name: Stop backup keepalived container
+  become: true
+  kolla_docker:
+    action: "stop_container"
+    # NOTE(yoctozepto): backup node might not have keepalived yet - ignore
+    ignore_missing: true
+    common_options: "{{ docker_common_options }}"
+    name: "keepalived"
+  when:
+    - kolla_action != "config"
+    - groups.kolla_ha_is_master_False is defined
+    - inventory_hostname in groups.kolla_ha_is_master_False
+  listen:
+    - Restart keepalived container
+
+- name: Restart backup haproxy container
   vars:
     service_name: "haproxy"
     service: "{{ haproxy_services[service_name] }}"
@@ -14,12 +60,20 @@
     dimensions: "{{ service.dimensions }}"
   when:
     - kolla_action != "config"
-    - inventory_hostname in groups[service.group]
-    - service.enabled | bool
+    - groups.kolla_ha_is_master_False is defined
+    - inventory_hostname in groups.kolla_ha_is_master_False
+  listen:
+    - Restart haproxy container
+    - Restart keepalived container
   notify:
-    - Waiting for haproxy to start
+    - Wait for backup haproxy to start
 
-- name: Restart keepalived container
+- name: Wait for backup haproxy to start
+  wait_for:
+    host: "{{ api_interface_address }}"
+    port: "{{ haproxy_monitor_port }}"
+
+- name: Start backup keepalived container
   vars:
     service_name: "keepalived"
     service: "{{ haproxy_services[service_name] }}"
@@ -34,17 +88,92 @@
     dimensions: "{{ service.dimensions }}"
   when:
     - kolla_action != "config"
-    - inventory_hostname in groups[service.group]
-    - service.enabled | bool
+    - groups.kolla_ha_is_master_False is defined
+    - inventory_hostname in groups.kolla_ha_is_master_False
+  listen:
+    - Restart keepalived container
+  notify:
+    - Wait for virtual IP to appear
+
+# NOTE(yoctozepto): This is to ensure haproxy can close any open connections
+# to the VIP address.
+- name: Stop master haproxy container
+  become: true
+  kolla_docker:
+    action: "stop_container"
+    common_options: "{{ docker_common_options }}"
+    name: "haproxy"
+  when:
+    - kolla_action != "config"
+    - groups.kolla_ha_is_master_True is defined
+    - inventory_hostname in groups.kolla_ha_is_master_True
+  listen:
+    - Restart keepalived container
+
+- name: Stop master keepalived container
+  become: true
+  kolla_docker:
+    action: "stop_container"
+    common_options: "{{ docker_common_options }}"
+    name: "keepalived"
+  when:
+    - kolla_action != "config"
+    - groups.kolla_ha_is_master_True is defined
+    - inventory_hostname in groups.kolla_ha_is_master_True
+  listen:
+    - Restart keepalived container
+
+- name: Start master haproxy container
+  vars:
+    service_name: "haproxy"
+    service: "{{ haproxy_services[service_name] }}"
+  become: true
+  kolla_docker:
+    action: "recreate_or_restart_container"
+    common_options: "{{ docker_common_options }}"
+    name: "{{ service.container_name }}"
+    image: "{{ service.image }}"
+    privileged: "{{ service.privileged | default(False) }}"
+    volumes: "{{ service.volumes }}"
+    dimensions: "{{ service.dimensions }}"
+  when:
+    - kolla_action != "config"
+    - groups.kolla_ha_is_master_True is defined
+    - inventory_hostname in groups.kolla_ha_is_master_True
+  listen:
+    - Restart haproxy container
+    - Restart keepalived container
   notify:
-    - Waiting for virtual IP to appear
+    - Wait for master haproxy to start
 
-- name: Waiting for haproxy to start
+- name: Wait for master haproxy to start
   wait_for:
     host: "{{ api_interface_address }}"
     port: "{{ haproxy_monitor_port }}"
 
-- name: Waiting for virtual IP to appear
+- name: Start master keepalived container
+  vars:
+    service_name: "keepalived"
+    service: "{{ haproxy_services[service_name] }}"
+  become: true
+  kolla_docker:
+    action: "recreate_or_restart_container"
+    common_options: "{{ docker_common_options }}"
+    name: "{{ service.container_name }}"
+    image: "{{ service.image }}"
+    privileged: "{{ service.privileged | default(False) }}"
+    volumes: "{{ service.volumes }}"
+    dimensions: "{{ service.dimensions }}"
+  when:
+    - kolla_action != "config"
+    - groups.kolla_ha_is_master_True is defined
+    - inventory_hostname in groups.kolla_ha_is_master_True
+  listen:
+    - Restart keepalived container
+  notify:
+    - Wait for virtual IP to appear
+
+- name: Wait for virtual IP to appear
   wait_for:
     host: "{{ kolla_internal_vip_address }}"
     port: "{{ haproxy_monitor_port }}"
diff --git a/ansible/roles/haproxy/tasks/check-containers.yml b/ansible/roles/haproxy/tasks/check-containers.yml
index a7ce8b4..f572177 100644
--- a/ansible/roles/haproxy/tasks/check-containers.yml
+++ b/ansible/roles/haproxy/tasks/check-containers.yml
@@ -1,5 +1,5 @@
 ---
-- name: Deploy haproxy containers
+- name: Check haproxy containers
   become: true
   kolla_docker:
     action: "compare_container"
diff --git a/ansible/roles/haproxy/tasks/upgrade.yml b/ansible/roles/haproxy/tasks/upgrade.yml
index 17fcbe0..5b10a7e 100644
--- a/ansible/roles/haproxy/tasks/upgrade.yml
+++ b/ansible/roles/haproxy/tasks/upgrade.yml
@@ -1,22 +1,2 @@
 ---
-- import_tasks: config-host.yml
-
-- import_tasks: config.yml
-
-- name: Stopping all slave keepalived containers
-  vars:
-    key: "{{ 'ipv6' if api_address_family == 'ipv6' else 'ipv4_secondaries' }}"
-    addresses: "{{ hostvars[inventory_hostname]['ansible_' + api_interface].get(key, []) | map(attribute='address') | list }}"
-  become: true
-  kolla_docker:
-    action: "stop_container"
-    common_options: "{{ docker_common_options }}"
-    name: "keepalived"
-  when: kolla_internal_vip_address not in addresses
-  notify:
-    - Restart keepalived container
-
-# NOTE(yoctozepto): haproxy role handlers should not be flushed early.
-# site.yml handles all haproxy things in a dedicated play.
-# This is to avoid extra haproxy service restart.
-# See: https://bugs.launchpad.net/kolla-ansible/+bug/1875228
+- import_tasks: deploy.yml
diff --git a/releasenotes/notes/bug-1863510-part-haproxy-0a76829d48818f92.yaml b/releasenotes/notes/bug-1863510-part-haproxy-0a76829d48818f92.yaml
new file mode 100644
index 0000000..f452e7f
--- /dev/null
+++ b/releasenotes/notes/bug-1863510-part-haproxy-0a76829d48818f92.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    Makes haproxy and keepalived restarts during Kolla-Ansible actions more
+    robust, especially in multinode scenarios (HA).
diff --git a/tests/test_kolla_docker.py b/tests/test_kolla_docker.py
index fae1898..c8dca35 100644
--- a/tests/test_kolla_docker.py
+++ b/tests/test_kolla_docker.py
@@ -94,6 +94,7 @@ class ModuleArgsTest(base.BaseTestCase):
             tty=dict(required=False, type='bool', default=False),
             client_timeout=dict(required=False, type='int', default=120),
             healthcheck=dict(required=False, type='dict'),
+            ignore_missing=dict(required=False, type='bool', default=False),
         )
         required_if = [
             ['action', 'pull_image', ['image']],
@@ -175,7 +176,15 @@ FAKE_DATA = {
          'Image': 'myregistrydomain.com:5000/ubuntu:16.04',
          'ImageID': 'sha256:c5f1cf30',
          'Labels': {},
-         'Names': '/my_container'}
+         'Names': '/my_container'},
+        {'Created': 1463578195,
+         'Status': 'Exited (0) 2 hours ago',
+         'HostConfig': {'NetworkMode': 'default'},
+         'Id': 'e40d8e7188',
+         'Image': 'myregistrydomain.com:5000/ubuntu:16.04',
+         'ImageID': 'sha256:c5f1cf30',
+         'Labels': {},
+         'Names': '/exited_container'},
     ],
 
     'container_inspect': {
@@ -396,6 +405,18 @@ class TestContainer(base.BaseTestCase):
         self.assertTrue(self.dw.changed)
         self.dw.dc.containers.assert_called_once_with(all=True)
         self.dw.dc.stop.assert_called_once_with('my_container', timeout=10)
+        self.dw.module.fail_json.assert_not_called()
+
+    def test_stop_container_already_stopped(self):
+        self.dw = get_DockerWorker({'name': 'exited_container',
+                                    'action': 'stop_container'})
+        self.dw.dc.containers.return_value = self.fake_data['containers']
+        self.dw.stop_container()
+
+        self.assertFalse(self.dw.changed)
+        self.dw.dc.containers.assert_called_once_with(all=True)
+        self.dw.module.fail_json.assert_not_called()
+        self.dw.dc.stop.assert_not_called()
 
     def test_stop_container_not_exists(self):
         self.dw = get_DockerWorker({'name': 'fake_container',
@@ -405,9 +426,22 @@ class TestContainer(base.BaseTestCase):
 
         self.assertFalse(self.dw.changed)
         self.dw.dc.containers.assert_called_once_with(all=True)
+        self.dw.dc.stop.assert_not_called()
         self.dw.module.fail_json.assert_called_once_with(
             msg="No such container: fake_container to stop")
 
+    def test_stop_container_not_exists_ignore_missing(self):
+        self.dw = get_DockerWorker({'name': 'fake_container',
+                                    'action': 'stop_container',
+                                    'ignore_missing': True})
+        self.dw.dc.containers.return_value = self.fake_data['containers']
+        self.dw.stop_container()
+
+        self.assertFalse(self.dw.changed)
+        self.dw.dc.containers.assert_called_once_with(all=True)
+        self.dw.dc.stop.assert_not_called()
+        self.dw.module.fail_json.assert_not_called()
+
     def test_stop_and_remove_container(self):
         self.dw = get_DockerWorker({'name': 'my_container',
                                     'action': 'stop_and_remove_container'})