commit c2d0bf30ea0718f0f415af6b6b7dfc232b6e4706 Author: Radosław Piliszek Date: Mon Apr 27 10:40:15 2020 +0200 Coordinate haproxy and keepalived restarts Keepalived and haproxy cooperate to provide control plane HA in kolla-ansible deployments. Certain care should be exerted to avoid prolonged availability loss during reconfigurations and upgrades. This patch aims to provide this care. There is nothing special about keepalived upgrade compared to reconfig, hence it is simplified to run the same code as for deploy. The broken logic of safe upgrade is replaced by common handler code which's goal is to ensure we down current master only after we have backups ready. This change introduces a switch to kolla_docker module that allows to ignore missing containers (as they are logically stopped). ignore_missing is the switch's name. All tests are included. Change-Id: I22ddec5f7ee4a7d3d502649a158a7e005fe29c48 diff --git a/ansible/library/kolla_docker.py b/ansible/library/kolla_docker.py index edead35..224c424 100644 --- a/ansible/library/kolla_docker.py +++ b/ansible/library/kolla_docker.py @@ -936,8 +936,10 @@ class DockerWorker(object): graceful_timeout = 10 container = self.check_container() if not container: - self.module.fail_json( - msg="No such container: {} to stop".format(name)) + ignore_missing = self.params.get('ignore_missing') + if not ignore_missing: + self.module.fail_json( + msg="No such container: {} to stop".format(name)) elif not container['Status'].startswith('Exited '): self.changed = True self.dc.stop(name, timeout=graceful_timeout) @@ -1069,6 +1071,7 @@ def generate_module(): dimensions=dict(required=False, type='dict', default=dict()), tty=dict(required=False, type='bool', default=False), client_timeout=dict(required=False, type='int', default=120), + ignore_missing=dict(required=False, type='bool', default=False), ) required_if = [ ['action', 'pull_image', ['image']], diff --git a/ansible/roles/haproxy/handlers/main.yml b/ansible/roles/haproxy/handlers/main.yml index 60fa5a0..ca94f34 100644 --- a/ansible/roles/haproxy/handlers/main.yml +++ b/ansible/roles/haproxy/handlers/main.yml @@ -1,5 +1,51 @@ --- -- name: Restart haproxy container +# NOTE(yoctozepto): this handler dance is to ensure we delay restarting master +# keepalived and haproxy which control VIP address until we have working backups. +# This could be improved by checking if backup keepalived do not report FAULT state. +# Master node is handled specially to let it close down connections and only then +# drop the VIP address by stopping keepalived service. + +# NOTE(yoctozepto): we need fresh VIP address placement info (facts may be old) +- name: Check IP addresses on the API interface + vars: + version: "{{ '6' if api_address_family == 'ipv6' else '4' }}" + become: true + command: ip -{{ version }} -o addr show dev {{ api_interface }} + register: ip_addr_output + changed_when: false + when: + - kolla_action != "config" + listen: + - Restart haproxy container + - Restart keepalived container + +- name: Group HA nodes by status + vars: + re_safe_address: "{{ kolla_internal_vip_address | regex_escape }}" + group_by: + key: kolla_ha_is_master_{{ ip_addr_output.stdout is regex('\b' + re_safe_address + '\b') }} + when: + - kolla_action != "config" + listen: + - Restart haproxy container + - Restart keepalived container + +- name: Stop backup keepalived container + become: true + kolla_docker: + action: "stop_container" + # NOTE(yoctozepto): backup node might not have keepalived yet - ignore + ignore_missing: true + common_options: "{{ docker_common_options }}" + name: "keepalived" + when: + - kolla_action != "config" + - groups.kolla_ha_is_master_False is defined + - inventory_hostname in groups.kolla_ha_is_master_False + listen: + - Restart keepalived container + +- name: Restart backup haproxy container vars: service_name: "haproxy" service: "{{ haproxy_services[service_name] }}" @@ -14,12 +60,20 @@ dimensions: "{{ service.dimensions }}" when: - kolla_action != "config" - - inventory_hostname in groups[service.group] - - service.enabled | bool + - groups.kolla_ha_is_master_False is defined + - inventory_hostname in groups.kolla_ha_is_master_False + listen: + - Restart haproxy container + - Restart keepalived container notify: - - Waiting for haproxy to start + - Wait for backup haproxy to start -- name: Restart keepalived container +- name: Wait for backup haproxy to start + wait_for: + host: "{{ api_interface_address }}" + port: "{{ haproxy_monitor_port }}" + +- name: Start backup keepalived container vars: service_name: "keepalived" service: "{{ haproxy_services[service_name] }}" @@ -34,17 +88,92 @@ dimensions: "{{ service.dimensions }}" when: - kolla_action != "config" - - inventory_hostname in groups[service.group] - - service.enabled | bool + - groups.kolla_ha_is_master_False is defined + - inventory_hostname in groups.kolla_ha_is_master_False + listen: + - Restart keepalived container + notify: + - Wait for virtual IP to appear + +# NOTE(yoctozepto): This is to ensure haproxy can close any open connections +# to the VIP address. +- name: Stop master haproxy container + become: true + kolla_docker: + action: "stop_container" + common_options: "{{ docker_common_options }}" + name: "haproxy" + when: + - kolla_action != "config" + - groups.kolla_ha_is_master_True is defined + - inventory_hostname in groups.kolla_ha_is_master_True + listen: + - Restart keepalived container + +- name: Stop master keepalived container + become: true + kolla_docker: + action: "stop_container" + common_options: "{{ docker_common_options }}" + name: "keepalived" + when: + - kolla_action != "config" + - groups.kolla_ha_is_master_True is defined + - inventory_hostname in groups.kolla_ha_is_master_True + listen: + - Restart keepalived container + +- name: Start master haproxy container + vars: + service_name: "haproxy" + service: "{{ haproxy_services[service_name] }}" + become: true + kolla_docker: + action: "recreate_or_restart_container" + common_options: "{{ docker_common_options }}" + name: "{{ service.container_name }}" + image: "{{ service.image }}" + privileged: "{{ service.privileged | default(False) }}" + volumes: "{{ service.volumes }}" + dimensions: "{{ service.dimensions }}" + when: + - kolla_action != "config" + - groups.kolla_ha_is_master_True is defined + - inventory_hostname in groups.kolla_ha_is_master_True + listen: + - Restart haproxy container + - Restart keepalived container notify: - - Waiting for virtual IP to appear + - Wait for master haproxy to start -- name: Waiting for haproxy to start +- name: Wait for master haproxy to start wait_for: host: "{{ api_interface_address }}" port: "{{ haproxy_monitor_port }}" -- name: Waiting for virtual IP to appear +- name: Start master keepalived container + vars: + service_name: "keepalived" + service: "{{ haproxy_services[service_name] }}" + become: true + kolla_docker: + action: "recreate_or_restart_container" + common_options: "{{ docker_common_options }}" + name: "{{ service.container_name }}" + image: "{{ service.image }}" + privileged: "{{ service.privileged | default(False) }}" + volumes: "{{ service.volumes }}" + dimensions: "{{ service.dimensions }}" + when: + - kolla_action != "config" + - groups.kolla_ha_is_master_True is defined + - inventory_hostname in groups.kolla_ha_is_master_True + listen: + - Restart keepalived container + notify: + - Wait for virtual IP to appear + +- name: Wait for virtual IP to appear wait_for: host: "{{ kolla_internal_vip_address }}" port: "{{ haproxy_monitor_port }}" diff --git a/ansible/roles/haproxy/tasks/check-containers.yml b/ansible/roles/haproxy/tasks/check-containers.yml index a7ce8b4..f572177 100644 --- a/ansible/roles/haproxy/tasks/check-containers.yml +++ b/ansible/roles/haproxy/tasks/check-containers.yml @@ -1,5 +1,5 @@ --- -- name: Deploy haproxy containers +- name: Check haproxy containers become: true kolla_docker: action: "compare_container" diff --git a/ansible/roles/haproxy/tasks/upgrade.yml b/ansible/roles/haproxy/tasks/upgrade.yml index 17fcbe0..5b10a7e 100644 --- a/ansible/roles/haproxy/tasks/upgrade.yml +++ b/ansible/roles/haproxy/tasks/upgrade.yml @@ -1,22 +1,2 @@ --- -- import_tasks: config-host.yml - -- import_tasks: config.yml - -- name: Stopping all slave keepalived containers - vars: - key: "{{ 'ipv6' if api_address_family == 'ipv6' else 'ipv4_secondaries' }}" - addresses: "{{ hostvars[inventory_hostname]['ansible_' + api_interface].get(key, []) | map(attribute='address') | list }}" - become: true - kolla_docker: - action: "stop_container" - common_options: "{{ docker_common_options }}" - name: "keepalived" - when: kolla_internal_vip_address not in addresses - notify: - - Restart keepalived container - -# NOTE(yoctozepto): haproxy role handlers should not be flushed early. -# site.yml handles all haproxy things in a dedicated play. -# This is to avoid extra haproxy service restart. -# See: https://bugs.launchpad.net/kolla-ansible/+bug/1875228 +- import_tasks: deploy.yml diff --git a/releasenotes/notes/bug-1863510-part-haproxy-0a76829d48818f92.yaml b/releasenotes/notes/bug-1863510-part-haproxy-0a76829d48818f92.yaml new file mode 100644 index 0000000..f452e7f --- /dev/null +++ b/releasenotes/notes/bug-1863510-part-haproxy-0a76829d48818f92.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Makes haproxy and keepalived restarts during Kolla-Ansible actions more + robust, especially in multinode scenarios (HA). diff --git a/tests/test_kolla_docker.py b/tests/test_kolla_docker.py index fae1898..c8dca35 100644 --- a/tests/test_kolla_docker.py +++ b/tests/test_kolla_docker.py @@ -94,6 +94,7 @@ class ModuleArgsTest(base.BaseTestCase): tty=dict(required=False, type='bool', default=False), client_timeout=dict(required=False, type='int', default=120), healthcheck=dict(required=False, type='dict'), + ignore_missing=dict(required=False, type='bool', default=False), ) required_if = [ ['action', 'pull_image', ['image']], @@ -175,7 +176,15 @@ FAKE_DATA = { 'Image': 'myregistrydomain.com:5000/ubuntu:16.04', 'ImageID': 'sha256:c5f1cf30', 'Labels': {}, - 'Names': '/my_container'} + 'Names': '/my_container'}, + {'Created': 1463578195, + 'Status': 'Exited (0) 2 hours ago', + 'HostConfig': {'NetworkMode': 'default'}, + 'Id': 'e40d8e7188', + 'Image': 'myregistrydomain.com:5000/ubuntu:16.04', + 'ImageID': 'sha256:c5f1cf30', + 'Labels': {}, + 'Names': '/exited_container'}, ], 'container_inspect': { @@ -396,6 +405,18 @@ class TestContainer(base.BaseTestCase): self.assertTrue(self.dw.changed) self.dw.dc.containers.assert_called_once_with(all=True) self.dw.dc.stop.assert_called_once_with('my_container', timeout=10) + self.dw.module.fail_json.assert_not_called() + + def test_stop_container_already_stopped(self): + self.dw = get_DockerWorker({'name': 'exited_container', + 'action': 'stop_container'}) + self.dw.dc.containers.return_value = self.fake_data['containers'] + self.dw.stop_container() + + self.assertFalse(self.dw.changed) + self.dw.dc.containers.assert_called_once_with(all=True) + self.dw.module.fail_json.assert_not_called() + self.dw.dc.stop.assert_not_called() def test_stop_container_not_exists(self): self.dw = get_DockerWorker({'name': 'fake_container', @@ -405,9 +426,22 @@ class TestContainer(base.BaseTestCase): self.assertFalse(self.dw.changed) self.dw.dc.containers.assert_called_once_with(all=True) + self.dw.dc.stop.assert_not_called() self.dw.module.fail_json.assert_called_once_with( msg="No such container: fake_container to stop") + def test_stop_container_not_exists_ignore_missing(self): + self.dw = get_DockerWorker({'name': 'fake_container', + 'action': 'stop_container', + 'ignore_missing': True}) + self.dw.dc.containers.return_value = self.fake_data['containers'] + self.dw.stop_container() + + self.assertFalse(self.dw.changed) + self.dw.dc.containers.assert_called_once_with(all=True) + self.dw.dc.stop.assert_not_called() + self.dw.module.fail_json.assert_not_called() + def test_stop_and_remove_container(self): self.dw = get_DockerWorker({'name': 'my_container', 'action': 'stop_and_remove_container'})