[{"begin": "2018-08-16T15:48:49Z", "created": "2018-08-16T17:43:51Z", "end": "2018-08-16T17:43:58Z", "external_desc": "Cloud SQL First Generation instances unavailable in us-central1-a", "modified": "2018-08-17T15:47:54Z", "most-recent-update": {"created": "2018-08-16T17:44:09Z", "modified": "2018-08-16T21:58:48Z", "text": "The issue with Cloud SQL First Generation instance unavailability has been resolved for all affected projects as of Thursday, 2018-08-16 10:38 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-08-16T17:44:09Z"}, "number": 18003, "public": true, "service_key": "cloud-sql", "service_name": "Google Cloud SQL", "severity": "medium", "updates": [{"created": "2018-08-16T17:44:09Z", "modified": "2018-08-16T21:58:48Z", "text": "The issue with Cloud SQL First Generation instance unavailability has been resolved for all affected projects as of Thursday, 2018-08-16 10:38 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-08-16T17:44:09Z"}, {"created": "2018-08-16T17:43:57Z", "modified": "2018-08-17T15:47:53Z", "text": "We've received a report of an issue with Google Cloud SQL as of Thursday, 2018-08-16 08:48 US/Pacific. We will provide more information by Thursday, 2018-08-16 11:00 US/Pacific.", "when": "2018-08-16T15:49:57Z"}], "uri": "/incident/cloud-sql/18003"},
{"begin": "2018-08-15T01:20:44Z", "created": "2018-08-15T01:20:45Z", "end": "2018-08-15T06:03:09Z", "external_desc": "Instances using Local SSD might experience VM failures. This affects GCE VMs globally. No data corruption has been observed.", "modified": "2018-08-15T06:03:09Z", "most-recent-update": {"created": "2018-08-15T06:03:09Z", "modified": "2018-08-15T06:03:09Z", "text": "The issue with GCE instances using Local SSD is believed to be affecting a very small number of projects and our Engineering Team is working on it. If you have questions or are impacted, please open a case with the Support Team and we will work with you until this issue is resolved. No further updates\r\nwill be provided here.", "when": "2018-08-15T06:03:09Z"}, "number": 18009, "public": true, "service_key": "compute", "service_name": "Google Compute Engine", "severity": "medium", "updates": [{"created": "2018-08-15T06:03:09Z", "modified": "2018-08-15T06:03:09Z", "text": "The issue with GCE instances using Local SSD is believed to be affecting a very small number of projects and our Engineering Team is working on it. If you have questions or are impacted, please open a case with the Support Team and we will work with you until this issue is resolved. No further updates\r\nwill be provided here.", "when": "2018-08-15T06:03:09Z"}, {"created": "2018-08-15T02:48:28Z", "modified": "2018-08-15T02:48:28Z", "text": "The issue with GCE instances using Local SSD is still ongoing. Our engineering team has a mitigation plan in place and is working to implement it. We will provide more information by Tuesday, 2018-08-14 23:00 US/Pacific", "when": "2018-08-15T02:48:28Z"}, {"created": "2018-08-15T01:20:53Z", "modified": "2018-08-15T01:20:53Z", "text": "Instances using Local SSD might experience VM failures. This affects GCE VMs globally. No data corruption has been observed. Our engineering team is working to mitigate the issue. We will provide more information by Tuesday, 2018-08-14 19:30 US/Pacific.", "when": "2018-08-15T01:20:53Z"}, {"created": "2018-08-15T01:20:47Z", "modified": "2018-08-15T01:20:47Z", "text": "Instances using Local SSD might experience VM failures.", "when": "2018-08-15T01:20:47Z"}], "uri": "/incident/compute/18009"},
{"begin": "2018-08-07T00:38:47Z", "created": "2018-08-07T03:29:51Z", "end": "2018-08-07T03:36:27Z", "external_desc": "We've received a report of an issue with Google Cloud SQL. Some customers are unable to connect to their CloudSQL Postgres instances.", "modified": "2018-08-07T03:40:08Z", "most-recent-update": {"created": "2018-08-07T03:36:28Z", "modified": "2018-08-07T03:36:28Z", "text": "The issue with customers losing connections to their CloudSQL Postgres instances has been resolved for all affected users as of Monday, 2018-08-06 19:57 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-08-07T03:36:28Z"}, "number": 18001, "public": true, "service_key": "cloud-sql", "service_name": "Google Cloud SQL", "severity": "medium", "updates": [{"created": "2018-08-07T03:36:28Z", "modified": "2018-08-07T03:36:28Z", "text": "The issue with customers losing connections to their CloudSQL Postgres instances has been resolved for all affected users as of Monday, 2018-08-06 19:57 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-08-07T03:36:28Z"}, {"created": "2018-08-07T03:30:28Z", "modified": "2018-08-07T03:30:28Z", "text": "The issue with customers losing connections to their CloudSQL Postgres instances should be resolved for most of projects and we expect a full resolution in the near future. We will provide another status update by Monday, 2018-08-06 21:00 US/Pacific with current details.", "when": "2018-08-07T03:30:28Z"}, {"created": "2018-08-07T03:30:03Z", "modified": "2018-08-07T03:30:03Z", "text": "We've received a report of an issue with Google Cloud SQL. Some customers are unable to connect to their CloudSQL Postgres instances.", "when": "2018-08-07T03:30:02Z"}], "uri": "/incident/cloud-sql/18001"},
{"begin": "2018-08-03T17:12:23Z", "created": "2018-08-03T17:12:24Z", "end": "2018-08-03T18:30:02Z", "external_desc": "We are investigating errors activating Windows and SUSE licenses on Google Compute Engine instances in all regions.", "modified": "2018-08-03T18:58:16Z", "most-recent-update": {"created": "2018-08-03T18:57:05Z", "modified": "2018-08-03T18:57:05Z", "text": "The issue with errors activating Windows and SUSE licenses on Google Compute Engine instances in all regions has been resolved for all affected users as of Friday, 2018-08-03 11:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-08-03T18:57:05Z"}, "number": 18008, "public": true, "service_key": "compute", "service_name": "Google Compute Engine", "severity": "medium", "updates": [{"created": "2018-08-03T18:57:05Z", "modified": "2018-08-03T18:57:05Z", "text": "The issue with errors activating Windows and SUSE licenses on Google Compute Engine instances in all regions has been resolved for all affected users as of Friday, 2018-08-03 11:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-08-03T18:57:05Z"}, {"created": "2018-08-03T18:26:45Z", "modified": "2018-08-03T18:26:45Z", "text": "We are still seeing errors activating Windows and SUSE licenses on Google Compute Engine instances in all regions. Our Engineering Team is investigating possible causes. We will provide another status update by Friday, 2018-08-03 12:30 US/Pacific with current details.", "when": "2018-08-03T18:26:45Z"}, {"created": "2018-08-03T17:18:41Z", "modified": "2018-08-03T17:18:41Z", "text": "We will provide more information by Friday, 2018-08-03 11:30 US/Pacific.", "when": "2018-08-03T17:18:41Z"}, {"created": "2018-08-03T17:12:25Z", "modified": "2018-08-03T17:12:25Z", "text": "We are investigating errors activating Windows and SUSE licenses on Google Compute Engine instances in all regions. Our Engineering Team is investigating possible causes. We will provide more information by Friday, 2018-08-03 10:30 US/Pacific.", "when": "2018-08-03T17:12:25Z"}], "uri": "/incident/compute/18008"},
{"begin": "2018-08-03T06:30:26Z", "created": "2018-08-03T12:28:28Z", "end": "2018-08-03T12:00:45Z", "external_desc": "We are investigating an issue with Dialogflow API. We will provide more information by Friday, 2018-08-03 06:30 US/Pacific.", "modified": "2018-08-06T01:24:30Z", "most-recent-update": {"created": "2018-08-03T13:10:49Z", "modified": "2018-08-03T13:10:49Z", "text": "The issue with Dialogflow API returning 502 for detectintent requests has been resolved for all affected projects as of Friday, 2018-08-03 04:50 US/Pacific.\r\nWe will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-08-03T13:10:49Z"}, "number": 18001, "public": true, "service_key": "cloud-ml", "service_name": "Cloud Machine Learning", "severity": "high", "updates": [{"created": "2018-08-03T13:10:49Z", "modified": "2018-08-03T13:10:49Z", "text": "The issue with Dialogflow API returning 502 for detectintent requests has been resolved for all affected projects as of Friday, 2018-08-03 04:50 US/Pacific.\r\nWe will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-08-03T13:10:49Z"}, {"created": "2018-08-03T12:28:31Z", "modified": "2018-08-03T12:28:31Z", "text": "We are investigating an issue with Dialogflow API. We will provide more information by Friday, 2018-08-03 06:00 US/Pacific.", "when": "2018-08-03T12:28:31Z"}], "uri": "/incident/cloud-ml/18001"},
{"begin": "2018-07-31T13:41:27Z", "created": "2018-07-31T13:41:28Z", "end": "2018-07-31T16:19:40Z", "external_desc": "Traffic loss in region europe-west2", "modified": "2018-07-31T16:19:44Z", "most-recent-update": {"created": "2018-07-31T16:19:44Z", "modified": "2018-07-31T16:19:44Z", "text": "The issue with traffic loss in europe-west2 has been resolved for all affected projects as of Tuesday, 2018-07-31 07:26 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-07-31T16:19:44Z"}, "number": 18014, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "high", "updates": [{"created": "2018-07-31T16:19:44Z", "modified": "2018-07-31T16:19:44Z", "text": "The issue with traffic loss in europe-west2 has been resolved for all affected projects as of Tuesday, 2018-07-31 07:26 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-07-31T16:19:44Z"}, {"created": "2018-07-31T15:53:44Z", "modified": "2018-07-31T15:53:44Z", "text": "Mitigation work is currently underway by our Engineering Team. We will provide another status update by Tuesday, 2018-07-31 10:00 US/Pacific with current details.", "when": "2018-07-31T15:53:44Z"}, {"created": "2018-07-31T14:45:46Z", "modified": "2018-07-31T14:45:46Z", "text": "The issue with traffic loss in GCP region europe-west2 should be mitigated and we expect a full resolution in the near future. We will provide another status update by Tuesday, 2018-07-31 08:45 US/Pacific with current details.", "when": "2018-07-31T14:45:46Z"}, {"created": "2018-07-31T14:02:37Z", "modified": "2018-07-31T14:02:37Z", "text": "We are experiencing traffic loss in GCP region europe-west2 beginning at Tuesday, 2018-07-31 06:45 US/Pacific. Early investigation indicate that approximately 20% of requests in this region are affected by this issue. For everyone who is affected, we apologize for the disruption. We will provide an update by Tuesday, 2018-07-31 07:45 US/Pacific with current details.", "when": "2018-07-31T14:02:37Z"}, {"created": "2018-07-31T13:41:29Z", "modified": "2018-07-31T13:41:29Z", "text": "We are investigating an issue with traffic loss in region europe-west2. We will provide more information by Tuesday, 2018-07-31 07:00 US/Pacific.", "when": "2018-07-31T13:41:29Z"}], "uri": "/incident/cloud-networking/18014"},
{"begin": "2018-07-28T01:27:32Z", "created": "2018-07-28T01:55:35Z", "end": "2018-07-28T02:31:24Z", "external_desc": "We are investigating issues with Internet access for VMs in the europe-west4 region.", "modified": "2018-08-07T21:53:11Z", "most-recent-update": {"created": "2018-08-07T21:51:08Z", "modified": "2018-08-07T21:53:10Z", "text": "ISSUE SUMMARY\r\n\r\nOn Friday 27 July 2018, for a duration of 1 hour 4 minutes, Google Compute Engine (GCE) instances and Cloud VPN tunnels in europe-west4 experienced loss of connectivity to the Internet. The incident affected all new or recently live migrated GCE instances. VPN tunnels created during the incident were also impacted. We apologize to our customers whose services or businesses were impacted during this incident, and we are taking immediate steps to avoid a recurrence.\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nAll Google Compute Engine (GCE) instances in europe-west4 created on Friday 27 July 2018 from 18:27 to 19:31 PDT lost connectivity to the Internet and other instances via their public IP addresses. Additionally any instances that live migrated during the outage period would have lost connectivity for approximately 30 minutes after the live migration completed. All Cloud VPN tunnels created during the impact period, and less than 1% of existing tunnels in europe-west4 also lost external connectivity. All other instances and VPN tunnels continued to serve traffic. Inter-instance traffic via private IP addresses remained unaffected. \r\n\r\nROOT CAUSE\r\n\r\nGoogle's datacenters utilize software load balancers known as Maglevs [1] to efficiently load balance network traffic [2] across service backends. The issue was caused by an unintended side effect of a configuration change made to jobs that are critical in coordinating the availability of Maglevs. The change unintentionally lowered the priority of these jobs in europe-west4. The issue was subsequently triggered when a datacenter maintenance event required load shedding of low priority jobs. This resulted in failure of a portion of the Maglev load balancers. However, a safeguard in the network control plane ensured that some Maglev capacity remained available. This layer of our typical defense-in-depth allowed connectivity to extant cloud resources to remain up, and restricted the disruption to new or migrated GCE instances and Cloud VPN tunnels. \r\n\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nAutomated monitoring alerted Google\u2019s engineering team to the event within 5 minutes and they immediately began investigating at 18:36. At 19:25 the team discovered the root cause and started reverting the configuration change. The issue was mitigated at 19:31 when the fix was rolled out. At this point, connectivity was restored immediately.\r\n\r\nIn addition to addressing the root cause, we will be implementing changes to both prevent and reduce the impact of this type of failure by improving our alerting when too many Maglevs become unavailable, and adding a check for configuration changes to detect priority reductions on critical dependencies. \r\n\r\nWe would again like to apologize for the impact that this incident had on our customers and their businesses in the europe-west4 region. We are conducting a detailed post-mortem to ensure that all the root and contributing causes of this event are understood and addressed promptly.\r\n\r\n\r\n[1] https://ai.google/research/pubs/pub44824  \r\n\r\n\r\n[2] https://cloudplatform.googleblog.com/2016/03/Google-shares-software-network-load-balancer-design-powering-GCP-networking.html", "when": "2018-08-07T21:51:08Z"}, "number": 18013, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "high", "updates": [{"created": "2018-08-07T21:51:08Z", "modified": "2018-08-07T21:53:10Z", "text": "ISSUE SUMMARY\r\n\r\nOn Friday 27 July 2018, for a duration of 1 hour 4 minutes, Google Compute Engine (GCE) instances and Cloud VPN tunnels in europe-west4 experienced loss of connectivity to the Internet. The incident affected all new or recently live migrated GCE instances. VPN tunnels created during the incident were also impacted. We apologize to our customers whose services or businesses were impacted during this incident, and we are taking immediate steps to avoid a recurrence.\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nAll Google Compute Engine (GCE) instances in europe-west4 created on Friday 27 July 2018 from 18:27 to 19:31 PDT lost connectivity to the Internet and other instances via their public IP addresses. Additionally any instances that live migrated during the outage period would have lost connectivity for approximately 30 minutes after the live migration completed. All Cloud VPN tunnels created during the impact period, and less than 1% of existing tunnels in europe-west4 also lost external connectivity. All other instances and VPN tunnels continued to serve traffic. Inter-instance traffic via private IP addresses remained unaffected. \r\n\r\nROOT CAUSE\r\n\r\nGoogle's datacenters utilize software load balancers known as Maglevs [1] to efficiently load balance network traffic [2] across service backends. The issue was caused by an unintended side effect of a configuration change made to jobs that are critical in coordinating the availability of Maglevs. The change unintentionally lowered the priority of these jobs in europe-west4. The issue was subsequently triggered when a datacenter maintenance event required load shedding of low priority jobs. This resulted in failure of a portion of the Maglev load balancers. However, a safeguard in the network control plane ensured that some Maglev capacity remained available. This layer of our typical defense-in-depth allowed connectivity to extant cloud resources to remain up, and restricted the disruption to new or migrated GCE instances and Cloud VPN tunnels. \r\n\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nAutomated monitoring alerted Google\u2019s engineering team to the event within 5 minutes and they immediately began investigating at 18:36. At 19:25 the team discovered the root cause and started reverting the configuration change. The issue was mitigated at 19:31 when the fix was rolled out. At this point, connectivity was restored immediately.\r\n\r\nIn addition to addressing the root cause, we will be implementing changes to both prevent and reduce the impact of this type of failure by improving our alerting when too many Maglevs become unavailable, and adding a check for configuration changes to detect priority reductions on critical dependencies. \r\n\r\nWe would again like to apologize for the impact that this incident had on our customers and their businesses in the europe-west4 region. We are conducting a detailed post-mortem to ensure that all the root and contributing causes of this event are understood and addressed promptly.\r\n\r\n\r\n[1] https://ai.google/research/pubs/pub44824  \r\n\r\n\r\n[2] https://cloudplatform.googleblog.com/2016/03/Google-shares-software-network-load-balancer-design-powering-GCP-networking.html", "when": "2018-08-07T21:51:08Z"}, {"created": "2018-07-28T02:44:25Z", "modified": "2018-07-28T02:44:25Z", "text": "The issue with Internet access for VMs in the europe-west4 region has been resolved for all affected projects as of Friday, 2018-07-27 19:45 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-07-28T02:44:25Z"}, {"created": "2018-07-28T02:40:44Z", "modified": "2018-07-28T02:40:44Z", "text": "Mitigation work is currently underway by our Engineering Team. We will provide another status update by Friday, 2018-07-27 20:30 US/Pacific with current details.", "when": "2018-07-28T02:40:44Z"}, {"created": "2018-07-28T02:28:11Z", "modified": "2018-07-28T02:28:11Z", "text": "Our Engineering Team believes they have identified the root cause of the issue and is working to mitigate. We will provide another status update by Friday, 2018-07-27 20:15 US/Pacific with current details.", "when": "2018-07-28T02:28:11Z"}, {"created": "2018-07-28T02:25:25Z", "modified": "2018-07-28T02:25:25Z", "text": "Investigation is currently underway by our Engineering Team. We will provide another status update by Friday, 2018-07-27 20:15 US/Pacific with current details.", "when": "2018-07-28T02:25:25Z"}, {"created": "2018-07-28T01:55:36Z", "modified": "2018-07-28T01:55:36Z", "text": "We are investigating an issue with Google Cloud Networking for VM instances in the europe-west4 region. We will provide more information by Friday, 2018-07-27 19:30 US/Pacific.", "when": "2018-07-28T01:55:36Z"}], "uri": "/incident/cloud-networking/18013"},
{"begin": "2018-07-17T19:17:51Z", "created": "2018-07-17T19:35:51Z", "end": "2018-07-17T19:55:44Z", "external_desc": "We've received a report of an issue with Google App Engine.", "modified": "2018-07-19T02:02:23Z", "most-recent-update": {"created": "2018-07-19T02:02:23Z", "modified": "2018-07-19T02:02:23Z", "text": "A detailed analysis has been written for this incident and is available at cloud networking incident 18012: https://status.cloud.google.com/incident/cloud-networking/18012", "when": "2018-07-19T02:02:23Z"}, "number": 18005, "public": true, "service_key": "appengine", "service_name": "Google App Engine", "severity": "high", "updates": [{"created": "2018-07-19T02:02:23Z", "modified": "2018-07-19T02:02:23Z", "text": "A detailed analysis has been written for this incident and is available at cloud networking incident 18012: https://status.cloud.google.com/incident/cloud-networking/18012", "when": "2018-07-19T02:02:23Z"}, {"created": "2018-07-17T20:23:44Z", "modified": "2018-07-17T20:23:44Z", "text": "The issue with Google App Engine has been resolved for all affected users as of Tuesday, 2018-07-17 13:05 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-07-17T20:23:44Z"}, {"created": "2018-07-17T19:58:50Z", "modified": "2018-07-17T19:58:50Z", "text": "The issue with Google App Engine should be resolved for some users and we expect a full resolution in the near future. We will provide another status update by Tuesday, 2018-07-17 13:30 US/Pacific with current details.", "when": "2018-07-17T19:58:50Z"}, {"created": "2018-07-17T19:35:53Z", "modified": "2018-07-17T19:35:53Z", "text": "We've received a report of an issue with Google App Engine as of Tuesday, 2018-07-17 12:26 US/Pacific. We will provide more information by Tuesday, 2018-07-17 13:00 US/Pacific.", "when": "2018-07-17T19:35:53Z"}, {"created": "2018-07-17T19:35:51Z", "modified": "2018-07-17T19:35:51Z", "text": "We've received a report of an issue with Google App Engine.", "when": "2018-07-17T19:35:51Z"}], "uri": "/incident/appengine/18005"},
{"begin": "2018-07-17T19:17:46Z", "created": "2018-07-17T19:34:48Z", "end": "2018-07-17T19:55:01Z", "external_desc": "The issue with Google Cloud Global Loadbalancers returning 502s has been fully resolved.", "modified": "2018-07-19T00:26:51Z", "most-recent-update": {"created": "2018-07-19T00:26:09Z", "modified": "2018-07-19T00:26:09Z", "text": "## ISSUE SUMMARY\r\nOn Tuesday, 17 July 2018, customers using Google Cloud App Engine, Google HTTP(S) Load Balancer, or TCP/SSL Proxy Load Balancers experienced elevated error rates ranging between 33% and 87% for a duration of 32 minutes. Customers observed errors consisting of either 502 return codes, or connection resets. We apologize to our customers whose services or businesses were impacted during this incident, and we are taking immediate steps to improve the platform\u2019s performance and availability. We will be providing customers with a SLA credit for the affected timeframe that impacted the Google Cloud HTTP(S) Load Balancer, TCP/SSL Proxy Load Balancer and Google App Engine products.\r\n\r\n\r\n## DETAILED DESCRIPTION OF IMPACT\r\nOn Tuesday, 17 July 2018, from 12:17 to 12:49 PDT, Google Cloud HTTP(S) Load Balancers returned 502s for some requests they received. The proportion of 502 return codes varied from 33% to 87% during the period. Automated monitoring alerted Google\u2019s engineering team to the event at 12:19, and at 12:44 the team had identified the probable root cause and deployed a fix. At 12:49 the fix became effective and the rate of 502s rapidly returned to a normal level. Services experienced degraded latency for several minutes longer as traffic returned and caches warmed. Serving fully recovered by 12:55. Connections to Cloud TCP/SSL Proxy Load Balancers would have been reset after connections to backends failed. Cloud services depending upon Cloud HTTP Load Balancing, such as Google App Engine application serving, Google Cloud Functions, Stackdriver's web UI, Dialogflow and the Cloud Support Portal/API, were affected for the duration of the incident.\r\n\r\nCloud CDN cache hits dropped 70% due to decreased references to Cloud CDN URLs from services behind Cloud HTTP(S) Load balancers and an inability to validate stale cache entries or insert new content on cache misses. Services running on Google Kubernetes Engine and using the Ingress resource would have served 502 return codes as mentioned above. Google Cloud Storage traffic served via Cloud Load Balancers was also impacted.\r\n\r\nOther Google Cloud Platform services were not impacted. For example, applications and services that use direct VM access, or Network Load Balancing, were not affected. \r\n\r\n\r\n## ROOT CAUSE\r\nGoogle\u2019s Global Load Balancers are based on a two-tiered architecture of Google Front Ends (GFE). The first tier of GFEs answer requests as close to the user as possible to maximize performance during connection setup. These GFEs route requests to a second layer of GFEs located close to the service which the request makes use of. This type of architecture allows clients to have low latency connections anywhere in the world, while taking advantage of Google\u2019s global network to serve requests to backends, regardless of in which region they are located.\r\n\r\nThe GFE development team was in the process of adding features to GFE to improve security and performance. These features had been introduced into the second layer GFE code base but not yet put into service. One of the features contained a bug which would cause the GFE to restart; this bug had not been detected in either of testing and initial rollout. At the beginning of the event, a configuration change in the production environment triggered the bug intermittently, which caused affected GFEs to repeatedly restart. Since restarts are not instantaneous, the available second layer GFE capacity was reduced. While some requests were correctly answered, other requests were interrupted (leading to connection resets) or denied due to a temporary lack of capacity while the GFEs were coming back online.\r\n\r\n\r\n## REMEDIATION AND PREVENTION\r\nGoogle engineers were alerted to the issue within 3 minutes and began immediately investigating. At 12:44 PDT, the team discovered the root cause, the configuration change was promptly reverted, and the affected GFEs ceased their restarts. As all GFEs returned to service, traffic resumed its normal levels and behavior.\r\n\r\nIn addition to fixing the underlying cause, we will be implementing changes to both prevent and reduce the impact of this type of failure in several ways:\r\n\r\n1\\. We are adding additional safeguards to disable features not yet in service.\r\n\r\n2\\. We plan to increase hardening of the GFE testing stack to reduce the risk of having a latent bug in production binaries that may cause a task to restart.\r\n\r\n3\\. We will also be pursuing additional isolation between different shards of GFE pools in order to reduce the scope of failures.\r\n\r\n4\\. Finally, to speed diagnosis in the future, we plan to create a consolidated dashboard of all configuration changes for GFE pools, allowing engineers to more easily and quickly observe, correlate, and identify problematic changes to the system.\r\n\r\nWe would again like to apologize for the impact that this incident had on our customers and their businesses. We take any incident that affects the availability and reliability of our customers extremely seriously, particularly incidents which span regions. We are conducting a thorough investigation of the incident and will be making the changes which result from that investigation our very top priority in GCP engineering.", "when": "2018-07-19T00:26:08Z"}, "number": 18012, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "high", "updates": [{"created": "2018-07-19T00:26:09Z", "modified": "2018-07-19T00:26:09Z", "text": "## ISSUE SUMMARY\r\nOn Tuesday, 17 July 2018, customers using Google Cloud App Engine, Google HTTP(S) Load Balancer, or TCP/SSL Proxy Load Balancers experienced elevated error rates ranging between 33% and 87% for a duration of 32 minutes. Customers observed errors consisting of either 502 return codes, or connection resets. We apologize to our customers whose services or businesses were impacted during this incident, and we are taking immediate steps to improve the platform\u2019s performance and availability. We will be providing customers with a SLA credit for the affected timeframe that impacted the Google Cloud HTTP(S) Load Balancer, TCP/SSL Proxy Load Balancer and Google App Engine products.\r\n\r\n\r\n## DETAILED DESCRIPTION OF IMPACT\r\nOn Tuesday, 17 July 2018, from 12:17 to 12:49 PDT, Google Cloud HTTP(S) Load Balancers returned 502s for some requests they received. The proportion of 502 return codes varied from 33% to 87% during the period. Automated monitoring alerted Google\u2019s engineering team to the event at 12:19, and at 12:44 the team had identified the probable root cause and deployed a fix. At 12:49 the fix became effective and the rate of 502s rapidly returned to a normal level. Services experienced degraded latency for several minutes longer as traffic returned and caches warmed. Serving fully recovered by 12:55. Connections to Cloud TCP/SSL Proxy Load Balancers would have been reset after connections to backends failed. Cloud services depending upon Cloud HTTP Load Balancing, such as Google App Engine application serving, Google Cloud Functions, Stackdriver's web UI, Dialogflow and the Cloud Support Portal/API, were affected for the duration of the incident.\r\n\r\nCloud CDN cache hits dropped 70% due to decreased references to Cloud CDN URLs from services behind Cloud HTTP(S) Load balancers and an inability to validate stale cache entries or insert new content on cache misses. Services running on Google Kubernetes Engine and using the Ingress resource would have served 502 return codes as mentioned above. Google Cloud Storage traffic served via Cloud Load Balancers was also impacted.\r\n\r\nOther Google Cloud Platform services were not impacted. For example, applications and services that use direct VM access, or Network Load Balancing, were not affected. \r\n\r\n\r\n## ROOT CAUSE\r\nGoogle\u2019s Global Load Balancers are based on a two-tiered architecture of Google Front Ends (GFE). The first tier of GFEs answer requests as close to the user as possible to maximize performance during connection setup. These GFEs route requests to a second layer of GFEs located close to the service which the request makes use of. This type of architecture allows clients to have low latency connections anywhere in the world, while taking advantage of Google\u2019s global network to serve requests to backends, regardless of in which region they are located.\r\n\r\nThe GFE development team was in the process of adding features to GFE to improve security and performance. These features had been introduced into the second layer GFE code base but not yet put into service. One of the features contained a bug which would cause the GFE to restart; this bug had not been detected in either of testing and initial rollout. At the beginning of the event, a configuration change in the production environment triggered the bug intermittently, which caused affected GFEs to repeatedly restart. Since restarts are not instantaneous, the available second layer GFE capacity was reduced. While some requests were correctly answered, other requests were interrupted (leading to connection resets) or denied due to a temporary lack of capacity while the GFEs were coming back online.\r\n\r\n\r\n## REMEDIATION AND PREVENTION\r\nGoogle engineers were alerted to the issue within 3 minutes and began immediately investigating. At 12:44 PDT, the team discovered the root cause, the configuration change was promptly reverted, and the affected GFEs ceased their restarts. As all GFEs returned to service, traffic resumed its normal levels and behavior.\r\n\r\nIn addition to fixing the underlying cause, we will be implementing changes to both prevent and reduce the impact of this type of failure in several ways:\r\n\r\n1\\. We are adding additional safeguards to disable features not yet in service.\r\n\r\n2\\. We plan to increase hardening of the GFE testing stack to reduce the risk of having a latent bug in production binaries that may cause a task to restart.\r\n\r\n3\\. We will also be pursuing additional isolation between different shards of GFE pools in order to reduce the scope of failures.\r\n\r\n4\\. Finally, to speed diagnosis in the future, we plan to create a consolidated dashboard of all configuration changes for GFE pools, allowing engineers to more easily and quickly observe, correlate, and identify problematic changes to the system.\r\n\r\nWe would again like to apologize for the impact that this incident had on our customers and their businesses. We take any incident that affects the availability and reliability of our customers extremely seriously, particularly incidents which span regions. We are conducting a thorough investigation of the incident and will be making the changes which result from that investigation our very top priority in GCP engineering.", "when": "2018-07-19T00:26:08Z"}, {"created": "2018-07-17T20:19:02Z", "modified": "2018-07-17T20:19:02Z", "text": "The issue with Google Cloud Global Load balancers returning 502s has been resolved for all affected users as of 13:05 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-07-17T20:19:02Z"}, {"created": "2018-07-17T19:53:12Z", "modified": "2018-07-17T19:53:12Z", "text": "The issue with Google Cloud Load balancers returning 502s should be resolved for majority of users and we expect a full resolution in the near future. We will provide another status update by Tuesday, 2018-07-17 13:30 US/Pacific with current details.", "when": "2018-07-17T19:53:12Z"}, {"created": "2018-07-17T19:34:50Z", "modified": "2018-07-17T19:34:50Z", "text": "We are investigating a problem with Google Cloud Global Loadbalancers returning 502s for many services including AppEngine, Stackdriver, Dialogflow, as well as customer Global Load Balancers. We will provide another update by Tuesday, 2018-07-17 13:00 US/Pacific", "when": "2018-07-17T19:34:50Z"}, {"created": "2018-07-17T19:34:49Z", "modified": "2018-07-17T19:34:49Z", "text": "We are investigating a problem with Google Cloud Global Loadbalancers returning 502s", "when": "2018-07-17T19:34:49Z"}], "uri": "/incident/cloud-networking/18012"},
{"begin": "2018-07-17T19:17:18Z", "created": "2018-07-17T19:25:19Z", "end": "2018-07-17T19:55:05Z", "external_desc": "We are investigating an issue with Google Stackdriver.", "modified": "2018-07-19T02:02:22Z", "most-recent-update": {"created": "2018-07-19T02:02:22Z", "modified": "2018-07-19T02:02:22Z", "text": "A detailed analysis has been written for this incident and is available at cloud networking incident 18012: https://status.cloud.google.com/incident/cloud-networking/18012", "when": "2018-07-19T02:02:22Z"}, "number": 18009, "public": true, "service_key": "google-stackdriver", "service_name": "Google Stackdriver", "severity": "high", "updates": [{"created": "2018-07-19T02:02:22Z", "modified": "2018-07-19T02:02:22Z", "text": "A detailed analysis has been written for this incident and is available at cloud networking incident 18012: https://status.cloud.google.com/incident/cloud-networking/18012", "when": "2018-07-19T02:02:22Z"}, {"created": "2018-07-17T20:23:17Z", "modified": "2018-07-17T20:23:17Z", "text": "The issue with Google Stackdriver has been resolved for all affected users as of Tuesday, 2018-07-17 13:05 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-07-17T20:23:17Z"}, {"created": "2018-07-17T19:58:43Z", "modified": "2018-07-17T19:58:43Z", "text": "The issue with Google Stackdriver should be resolved for some users and we expect a full resolution in the near future. We will provide another status update by Tuesday, 2018-07-17 13:30 US/Pacific with current details.", "when": "2018-07-17T19:58:43Z"}, {"created": "2018-07-17T19:40:27Z", "modified": "2018-07-17T19:40:27Z", "text": "We are investigating an issue with Google Stackdriver. Affected customers are unable to access all Stackdriver services via stackdriver.com, the Cloud Console, and the API. We will provide more information by Tuesday, 2018-07-17 13:30 US/Pacific.", "when": "2018-07-17T19:40:27Z"}, {"created": "2018-07-17T19:25:19Z", "modified": "2018-07-17T19:25:19Z", "text": "We are investigating an issue with Google Stackdriver. 
We will provide more information by Tuesday, 2018-07-17 12:45 US/Pacific.", "when": "2018-07-17T19:25:19Z"}], "uri": "/incident/google-stackdriver/18009"}, {"begin": "2018-07-17T19:17:00Z", "created": "2018-07-18T00:49:18Z", "end": "2018-07-17T19:55:00Z", "external_desc": "Support Center inaccessible", "modified": "2018-07-19T02:02:12Z", "most-recent-update": {"created": "2018-07-19T02:02:11Z", "modified": "2018-07-19T02:02:11Z", "text": "A detailed analysis has been written for this incident and is available at cloud networking incident 18012: https://status.cloud.google.com/incident/cloud-networking/18012", "when": "2018-07-19T02:02:11Z"}, "number": 18002, "public": true, "service_key": "support", "service_name": "Google Cloud Support", "severity": "high", "updates": [{"created": "2018-07-19T02:02:11Z", "modified": "2018-07-19T02:02:11Z", "text": "A detailed analysis has been written for this incident and is available at cloud networking incident 18012: https://status.cloud.google.com/incident/cloud-networking/18012", "when": "2018-07-19T02:02:11Z"}, {"created": "2018-07-18T00:49:18Z", "modified": "2018-07-18T00:49:18Z", "text": "The issue with accessing Support Center has been resolved for all affected users as of Tuesday, 2018-07-17 13:10 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. 
We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-07-18T00:49:18Z"}], "uri": "/incident/support/18002"}, {"begin": "2018-07-17T08:52:00Z", "created": "2018-07-17T10:25:46Z", "end": "2018-07-17T09:46:07Z", "external_desc": "Stackdriver disturbance", "modified": "2018-07-17T15:56:52Z", "most-recent-update": {"created": "2018-07-17T12:43:07Z", "modified": "2018-07-17T12:43:07Z", "text": "The issue with Cloud Stackdriver, where you could have experienced some latency on the logs, has been resolved for all affected users as of 02:53 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-07-17T12:43:07Z"}, "number": 18008, "public": true, "service_key": "google-stackdriver", "service_name": "Google Stackdriver", "severity": "high", "updates": [{"created": "2018-07-17T12:43:07Z", "modified": "2018-07-17T12:43:07Z", "text": "The issue with Cloud Stackdriver, where you could have experienced some latency on the logs, has been resolved for all affected users as of 02:53 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-07-17T12:43:07Z"}, {"created": "2018-07-17T10:25:46Z", "modified": "2018-07-17T10:25:46Z", "text": "The issue with Cloud Stackdriver, where you could have experienced some latency on the logs, has been resolved for all affected users as of 02:53 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. 
We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-07-17T10:25:46Z"}], "uri": "/incident/google-stackdriver/18008"}, {"begin": "2018-07-04T11:34:59Z", "created": "2018-07-04T11:35:00Z", "end": "2018-07-04T15:54:25Z", "external_desc": "The issue with Google Cloud networking in europe-west1-b and europe-west4-b has been resolved.", "modified": "2018-07-04T15:54:25Z", "most-recent-update": {"created": "2018-07-04T15:54:25Z", "modified": "2018-07-04T15:54:25Z", "text": "The issue with VM public IP address traffic in europe-west1-b and europe-west4-b has been resolved for all affected projects as of Wednesday, 2018-07-04 08:53 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-07-04T15:54:25Z"}, "number": 18007, "public": true, "service_key": "compute", "service_name": "Google Compute Engine", "severity": "medium", "updates": [{"created": "2018-07-04T15:54:25Z", "modified": "2018-07-04T15:54:25Z", "text": "The issue with VM public IP address traffic in europe-west1-b and europe-west4-b has been resolved for all affected projects as of Wednesday, 2018-07-04 08:53 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-07-04T15:54:25Z"}, {"created": "2018-07-04T15:38:47Z", "modified": "2018-07-04T15:38:47Z", "text": "We are allocating additional capacity to handle this load. Most of the changes are complete and access controls have been rolled back. The situation is still being closely monitored and further adjustments are still possible/likely. 
We will provide another status update by Wednesday, 2018-07-04 10:00 US/Pacific with current details.", "when": "2018-07-04T15:38:47Z"}, {"created": "2018-07-04T14:24:33Z", "modified": "2018-07-04T14:24:33Z", "text": "Mitigation work is currently underway by our Engineering Team. We will provide another status update by Wednesday, 2018-07-04 08:30 US/Pacific with current details.", "when": "2018-07-04T14:24:33Z"}, {"created": "2018-07-04T13:10:21Z", "modified": "2018-07-04T13:10:21Z", "text": "We are rolling out a fix to mitigate this issue. We will provide another status update by Wednesday, 2018-07-04 07:30 US/Pacific with current details.", "when": "2018-07-04T13:10:21Z"}, {"created": "2018-07-04T12:29:35Z", "modified": "2018-07-04T12:29:35Z", "text": "Our Engineering Team believes they have identified the root cause of the networking issue and is working to mitigate it. We will provide another status update by Wednesday, 2018-07-04 06:30 US/Pacific with current details.", "when": "2018-07-04T12:29:35Z"}, {"created": "2018-07-04T11:35:02Z", "modified": "2018-07-04T11:35:02Z", "text": "We've received a report of an issue with Google Cloud networking in europe-west1-b and europe-west4-b. Our Engineering Team is investigating possible causes. 
We will provide another status update by Wednesday, 2018-07-04 05:30 US/Pacific with current details.", "when": "2018-07-04T11:35:02Z"}, {"created": "2018-07-04T11:35:01Z", "modified": "2018-07-04T11:35:01Z", "text": "We've received a report of an issue with Google Cloud networking in europe-west1-b and europe-west4-b.", "when": "2018-07-04T11:35:01Z"}], "uri": "/incident/compute/18007"}, {"begin": "2018-07-04T07:05:10Z", "created": "2018-07-04T07:05:11Z", "end": "2018-07-04T07:56:13Z", "external_desc": "Issue with Google Cloud Pub/Sub in us-east1.", "modified": "2018-07-04T07:56:14Z", "most-recent-update": {"created": "2018-07-04T07:56:14Z", "modified": "2018-07-04T07:56:14Z", "text": "The issue with Google Cloud Pub/Sub has been resolved for all affected users as of Tuesday, 2018-07-03 00:50 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-07-04T07:56:14Z"}, "number": 18003, "public": true, "service_key": "cloud-pubsub", "service_name": "Google Cloud Pub/Sub", "severity": "medium", "updates": [{"created": "2018-07-04T07:56:14Z", "modified": "2018-07-04T07:56:14Z", "text": "The issue with Google Cloud Pub/Sub has been resolved for all affected users as of Tuesday, 2018-07-03 00:50 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-07-04T07:56:14Z"}, {"created": "2018-07-04T07:05:15Z", "modified": "2018-07-04T07:05:15Z", "text": "Google Cloud Pub/Sub as of Tuesday, 2018-07-03 23:54 US/Pacific has begun dropping requests. The issue with Cloud Pubsub appears to be affecting us-east1. 
We will provide more information by Wednesday, 2018-07-04 00:50 US/Pacific.", "when": "2018-07-04T07:05:15Z"}, {"created": "2018-07-04T07:05:13Z", "modified": "2018-07-04T07:05:13Z", "text": "Google Cloud Pub/Sub as of Tuesday, 2018-07-03 23:54 US/Pacific has begun dropping requests. The issue with Cloud Pubsub appears to be affecting us-east1.", "when": "2018-07-04T07:05:13Z"}], "uri": "/incident/cloud-pubsub/18003"}, {"begin": "2018-06-27T21:21:35Z", "created": "2018-06-27T21:45:36Z", "end": "2018-06-27T22:11:18Z", "external_desc": "Issue affecting the ability to list projects and organizations", "modified": "2018-07-02T16:20:09Z", "most-recent-update": {"created": "2018-07-02T16:20:09Z", "modified": "2018-07-02T16:20:09Z", "text": "ISSUE SUMMARY\r\n\r\nOn Wednesday 27 June 2018, Google Cloud Console and Google Cloud Resource Manager API experienced a significant performance degradation for a duration of 51 minutes. Although, this did not affect user resources running on the Google Cloud Platform, we understand that our customers rely heavily on Google Cloud Console to manage their resources and we sincerely apologize to everyone who was affected by the incident. \r\n\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nFrom 14:21 to 15:11 PDT, users were unable to manage their folders, projects and organizations using Google Cloud Console, Google Cloud Resource Manager API and gcloud command line. The following APIs were affected:\r\n\r\nGoogle Cloud Console: Impacted users were unable to list their projects, search for projects, folders and organizations or view their bill. Search box failed to return the above too.\r\nGoogle Cloud Resource Manager API: Impacted users were unable to list their projects, folders and organizations\r\nBigQuery: Impacted users were unable to list their bigquery projects using the API.\r\n\r\n\r\nROOT CAUSE\r\n\r\nThe incident was triggered by a configuration change in the search infrastructure powering Cloud resource metadata search. 
The search infrastructure sends ACL checks to a central ACL server to make sure the end user has access to the Cloud resource metadata it plans to return. The configuration change introduced a new field in the ACL check request, while the central ACL server had not whitelisted the search infrastructure to send that field, causing an outage in Cloud resource metadata search.\r\n\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nAt 12:26 PDT, Google Engineers rolled out the configuration change. Our automated release validation system rejected the change due to a high rate of errors. Around 14:16 PDT, an unrelated change was made to the same search infrastructure which triggered a bug that disabled its automated release validation system. This change also inadvertently picked up the prior configuration change and due to the lack of automated release validation, the change was successfully propagated to production.  Within a span of few minutes, several engineering teams were automatically alerted to the situation and began the mitigation process. \r\n\r\nThe issue was fully mitigated at 15:11 PDT when the configuration change was rolled back. \r\n\r\nWe apologize again for the inconvenience caused. 
We are taking immediate steps to prevent recurrence and improve reliability in the future, including: \r\n\r\nFixing the bug that inadvertently disabled the canary analysis system.\r\nImproving process around pushing changes that involve several dependencies.\r\nImproving testing and staging alerts to catch issues of this nature before they reach production.", "when": "2018-07-02T16:20:09Z"}, "number": 18001, "public": true, "service_key": "cloud-iam", "service_name": "Identity & Security", "severity": "medium", "updates": [{"created": "2018-07-02T16:20:09Z", "modified": "2018-07-02T16:20:09Z", "text": "ISSUE SUMMARY\r\n\r\nOn Wednesday 27 June 2018, Google Cloud Console and Google Cloud Resource Manager API experienced a significant performance degradation for a duration of 51 minutes. Although, this did not affect user resources running on the Google Cloud Platform, we understand that our customers rely heavily on Google Cloud Console to manage their resources and we sincerely apologize to everyone who was affected by the incident. \r\n\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nFrom 14:21 to 15:11 PDT, users were unable to manage their folders, projects and organizations using Google Cloud Console, Google Cloud Resource Manager API and gcloud command line. The following APIs were affected:\r\n\r\nGoogle Cloud Console: Impacted users were unable to list their projects, search for projects, folders and organizations or view their bill. Search box failed to return the above too.\r\nGoogle Cloud Resource Manager API: Impacted users were unable to list their projects, folders and organizations\r\nBigQuery: Impacted users were unable to list their bigquery projects using the API.\r\n\r\n\r\nROOT CAUSE\r\n\r\nThe incident was triggered by a configuration change in the search infrastructure powering Cloud resource metadata search. 
The search infrastructure sends ACL checks to a central ACL server to make sure the end user has access to the Cloud resource metadata it plans to return. The configuration change introduced a new field in the ACL check request, while the central ACL server had not whitelisted the search infrastructure to send that field, causing an outage in Cloud resource metadata search.\r\n\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nAt 12:26 PDT, Google Engineers rolled out the configuration change. Our automated release validation system rejected the change due to a high rate of errors. Around 14:16 PDT, an unrelated change was made to the same search infrastructure which triggered a bug that disabled its automated release validation system. This change also inadvertently picked up the prior configuration change and due to the lack of automated release validation, the change was successfully propagated to production.  Within a span of few minutes, several engineering teams were automatically alerted to the situation and began the mitigation process. \r\n\r\nThe issue was fully mitigated at 15:11 PDT when the configuration change was rolled back. \r\n\r\nWe apologize again for the inconvenience caused. We are taking immediate steps to prevent recurrence and improve reliability in the future, including: \r\n\r\nFixing the bug that inadvertently disabled the canary analysis system.\r\nImproving process around pushing changes that involve several dependencies.\r\nImproving testing and staging alerts to catch issues of this nature before they reach production.", "when": "2018-07-02T16:20:09Z"}, {"created": "2018-06-27T22:23:31Z", "modified": "2018-06-27T22:23:31Z", "text": "The issue impacting the ability to list projects and organizations has been resolved for all affected users as of 15:16 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. 
We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-06-27T22:23:31Z"}, {"created": "2018-06-27T22:13:52Z", "modified": "2018-06-27T22:13:52Z", "text": "We are experiencing an issue impacting the ability of customers to list projects and organizations beginning at Wednesday, 2018-06-27 14:34 US/Pacific. Current data indicate(s) that approximately 75% of all GCP customers are affected by this issue. For everyone who is affected, we apologize for the disruption. We will provide an update by Wednesday, 2018-06-27 16:13 US/Pacific with current details.", "when": "2018-06-27T22:13:52Z"}, {"created": "2018-06-27T21:45:37Z", "modified": "2018-06-27T21:45:37Z", "text": "We are investigating an issue with Identity & Access Management. We will provide more information by Wednesday, 2018-06-27 15:00 US/Pacific.", "when": "2018-06-27T21:45:37Z"}], "uri": "/incident/cloud-iam/18001"}, {"begin": "2018-06-23T19:33:04Z", "created": "2018-06-23T19:33:04Z", "end": "2018-06-23T20:18:11Z", "external_desc": "We've received a report of an issue with Google Cloud Networking in us-east1.", "modified": "2018-06-23T20:18:15Z", "most-recent-update": {"created": "2018-06-23T20:18:14Z", "modified": "2018-06-23T20:18:14Z", "text": "The issue with Google Cloud Networking in us-east1 has been resolved for all affected projects as of Saturday, 2018-06-23 13:16 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-06-23T20:18:14Z"}, "number": 18006, "public": true, "service_key": "compute", "service_name": "Google Compute Engine", "severity": "medium", "updates": [{"created": "2018-06-23T20:18:14Z", "modified": "2018-06-23T20:18:14Z", "text": "The issue with Google Cloud Networking in us-east1 has been resolved for all affected projects as of Saturday, 2018-06-23 13:16 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-06-23T20:18:14Z"}, {"created": "2018-06-23T19:33:07Z", "modified": "2018-06-23T19:33:07Z", "text": "We are experiencing an issue with Google Cloud Networking beginning on Saturday, 2018-06-23 12:02 US/Pacific. Current data indicate that approximately 33% of projects in us-east1 are affected by this issue. For everyone who is affected, we apologize for the disruption. We will provide an update by Saturday, 2018-06-23 14:00 US/Pacific with current details.", "when": "2018-06-23T19:33:07Z"}, {"created": "2018-06-23T19:33:05Z", "modified": "2018-06-23T19:33:05Z", "text": "We've received a report of an issue with Google Cloud Networking in us-east1.", "when": "2018-06-23T19:33:05Z"}], "uri": "/incident/compute/18006"}, {"begin": "2018-06-23T00:40:00Z", "created": "2018-06-23T01:48:34Z", "end": "2018-06-23T00:50:00Z", "external_desc": "Google App Engine Serving 404 Errors in us-central1", "modified": "2018-06-23T01:48:34Z", "most-recent-update": {"created": "2018-06-23T01:48:34Z", "modified": "2018-06-23T01:48:34Z", "text": "During the period of 5:40 PM PDT and 5:50 PM PDT we detected 20% of Google App Engine traffic within us-central1 to be serving 404 errors.\r\n\r\nThis issue has been resolved for all affected projects. 
We will conduct\r\nan internal investigation of this issue and make appropriate improvements to our\r\nsystems to help prevent or minimize future recurrence.", "when": "2018-06-23T01:48:34Z"}, "number": 18004, "public": true, "service_key": "appengine", "service_name": "Google App Engine", "severity": "medium", "updates": [{"created": "2018-06-23T01:48:34Z", "modified": "2018-06-23T01:48:34Z", "text": "During the period of 5:40 PM PDT and 5:50 PM PDT we detected 20% of Google App Engine traffic within us-central1 to be serving 404 errors.\r\n\r\nThis issue has been resolved for all affected projects. We will conduct\r\nan internal investigation of this issue and make appropriate improvements to our\r\nsystems to help prevent or minimize future recurrence.", "when": "2018-06-23T01:48:34Z"}], "uri": "/incident/appengine/18004"}, {"begin": "2018-06-22T19:06:58Z", "created": "2018-06-22T19:50:58Z", "end": "2018-06-22T20:12:38Z", "external_desc": "We've received a report of an issue with Google BigQuery.", "modified": "2018-06-27T16:22:07Z", "most-recent-update": {"created": "2018-06-27T16:22:07Z", "modified": "2018-06-27T16:22:07Z", "text": "ISSUE SUMMARY\r\n\r\nOn Friday 22 June 2018, Google BigQuery experienced increased query failures for a duration of 1 hour 6 minutes. We apologize for the impact of this issue on our customers and are making changes to mitigate and prevent a recurrence. \r\n\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nOn Friday 22 June 2018 from 12:06 to 13:12 PDT, up to 50% of total requests to the BigQuery API failed with error code 503. Error rates varied during the incident, with some customers experiencing 100% failure rate for their BigQuery table jobs. bigquery.tabledata.insertAll jobs were unaffected. \r\n\r\nROOT CAUSE\r\n\r\nA new release of the BigQuery API introduced a software defect that caused the API component to return larger-than-normal responses to the BigQuery router server. 
The router server is responsible for examining each request, routing it to a backend server, and returning the response to the client. To process these large responses, the router server allocated more memory which led to an increase in garbage collection. This resulted in an increase in CPU utilization, which caused our automated load balancing system to shrink the server capacity as a safeguard against abuse. With the reduced capacity and now comparatively large throughput of requests, the denial of service protection system used by BigQuery responded by rejecting user requests, causing a high rate of 503 errors. \r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nGoogle Engineers initially mitigated the issue by increasing the capacity of the BigQuery router server which prevented overload and allowed API traffic to resume normally. The issue was fully resolved by identifying and reverting the change that caused large response sizes.\r\n\r\nTo prevent future occurrences, BigQuery engineers will also be adjusting capacity alerts to improve monitoring of server overutilization. \r\n\r\nWe apologize once again for the impact of this incident on your business.", "when": "2018-06-27T16:22:07Z"}, "number": 18037, "public": true, "service_key": "bigquery", "service_name": "Google BigQuery", "severity": "high", "updates": [{"created": "2018-06-27T16:22:07Z", "modified": "2018-06-27T16:22:07Z", "text": "ISSUE SUMMARY\r\n\r\nOn Friday 22 June 2018, Google BigQuery experienced increased query failures for a duration of 1 hour 6 minutes. We apologize for the impact of this issue on our customers and are making changes to mitigate and prevent a recurrence. \r\n\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nOn Friday 22 June 2018 from 12:06 to 13:12 PDT, up to 50% of total requests to the BigQuery API failed with error code 503. Error rates varied during the incident, with some customers experiencing 100% failure rate for their BigQuery table jobs. 
bigquery.tabledata.insertAll jobs were unaffected. \r\n\r\nROOT CAUSE\r\n\r\nA new release of the BigQuery API introduced a software defect that caused the API component to return larger-than-normal responses to the BigQuery router server. The router server is responsible for examining each request, routing it to a backend server, and returning the response to the client. To process these large responses, the router server allocated more memory which led to an increase in garbage collection. This resulted in an increase in CPU utilization, which caused our automated load balancing system to shrink the server capacity as a safeguard against abuse. With the reduced capacity and now comparatively large throughput of requests, the denial of service protection system used by BigQuery responded by rejecting user requests, causing a high rate of 503 errors. \r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nGoogle Engineers initially mitigated the issue by increasing the capacity of the BigQuery router server which prevented overload and allowed API traffic to resume normally. The issue was fully resolved by identifying and reverting the change that caused large response sizes.\r\n\r\nTo prevent future occurrences, BigQuery engineers will also be adjusting capacity alerts to improve monitoring of server overutilization. \r\n\r\nWe apologize once again for the impact of this incident on your business.", "when": "2018-06-27T16:22:07Z"}, {"created": "2018-06-22T20:32:40Z", "modified": "2018-06-22T20:32:40Z", "text": "The issue with Google BigQuery has been resolved for all affected projects as of Friday, 2018-06-22 13:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.  
We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-06-22T20:32:40Z"}, {"created": "2018-06-22T20:15:45Z", "modified": "2018-06-22T20:15:45Z", "text": "Mitigation work is currently underway by our Engineering Team. We will provide another status update by Friday, 2018-06-22 14:15 US/Pacific with current details.", "when": "2018-06-22T20:15:45Z"}, {"created": "2018-06-22T19:51:07Z", "modified": "2018-06-22T19:51:07Z", "text": "We are investigating an issue with Google BigQuery. Our Engineering Team is investigating possible causes. Affected customers may see their queries fail with 500 errors.\r\nWe will provide another status update by Friday, 2018-06-22 14:00 US/Pacific with current details.", "when": "2018-06-22T19:51:07Z"}, {"created": "2018-06-22T19:51:04Z", "modified": "2018-06-22T19:51:04Z", "text": "We've received a report of an issue with Google BigQuery.", "when": "2018-06-22T19:51:04Z"}], "uri": "/incident/bigquery/18037"}, {"begin": "2018-06-15T22:27:21Z", "created": "2018-06-15T22:27:23Z", "end": "2018-06-16T20:13:00Z", "external_desc": "Google Compute Engine VM instances allocated with duplicate internal IP addresses,  stopped instances networking are not coming up when started.", "modified": "2018-06-16T20:13:08Z", "most-recent-update": {"created": "2018-06-16T20:13:08Z", "modified": "2018-06-16T20:13:08Z", "text": "The issue with Google Compute Engine VM instances being allocated duplicate external IP addresses has been resolved for all affected projects as of Saturday, 2018-06-16 12:59 US/Pacific. Customers with VMs having duplicate internal IP addresses should follow the workaround described earlier, which is to delete (without deleting the boot disk), and recreate the affected VM instances. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-06-16T20:13:08Z"}, "number": 18005, "public": true, "service_key": "compute", "service_name": "Google Compute Engine", "severity": "medium", "updates": [{"created": "2018-06-16T20:13:08Z", "modified": "2018-06-16T20:13:08Z", "text": "The issue with Google Compute Engine VM instances being allocated duplicate external IP addresses has been resolved for all affected projects as of Saturday, 2018-06-16 12:59 US/Pacific. Customers with VMs having duplicate internal IP addresses should follow the workaround described earlier, which is to delete (without deleting the boot disk), and recreate the affected VM instances. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-06-16T20:13:08Z"}, {"created": "2018-06-16T10:33:32Z", "modified": "2018-06-16T10:33:32Z", "text": "Mitigation work is currently underway by our Engineering Team. GCE VMs that have an internal IP that is not assigned to another VM within the same project, region and network should no longer see this issue occurring, however instances where another VM is using their internal IP may fail to start with networking. We will provide another status update by Saturday, 2018-06-16 15:00 US/Pacific with current details.", "when": "2018-06-16T10:33:32Z"}, {"created": "2018-06-16T03:03:18Z", "modified": "2018-06-16T03:03:18Z", "text": "Detailed Description\r\nWe are investigating an issue with Google Compute Engine VM instances failing to start with networking after being stopped, or new instances being allocated with the same IP address as a VM instance which was stopped within the past 24 hours. Our Engineering Team is evaluating a fix in a test environment. 
We will provide another status update by Saturday, 2018-06-16 03:30 US/Pacific with current details.\r\n\r\nDiagnosis\r\nInstances that were stopped at any time between 2018-06-14 08:42 and 2018-06-15 13:40 US/Pacific may fail to start with networking. A newly allocated VM instance has the same IP address as a VM instance which was stopped within the mentioned time period.\r\n\r\nWorkaround\r\nAs an immediate mitigation to fix instances for which networking is not working, instances should be recreated, that is a delete (without deleting the boot disk), and a create. e.g:\r\n\r\ngcloud compute instances describe instance-1 --zone us-central1-a\r\n\r\ngcloud compute instances delete instance-1 --zone us-central1-a --keep-disks=all\r\n\r\ngcloud compute instances create instance-1 --zone-us-central1-a --disk='boot=yes,name=instance-1'\r\n\r\nTo prevent new instances from coming up with duplicate IP addresses we suggest creating f1-micros until new ip addresses are allocated, and then stopping the instances to stop incurring charges. Alternatively new instances can be brought up with a static internal ip address.", "when": "2018-06-16T03:03:18Z"}, {"created": "2018-06-16T00:11:15Z", "modified": "2018-06-16T00:11:15Z", "text": "Our Engineering Team continues to evaluate the fix in a test environment. We believe that customers can work around the issue by launching then stopping f1 micro instances until no more duplicate IP addresses are obtained. We are awaiting confirmation that the provided workaround works for customers. We will provide another status update by Friday, 2018-06-15 20:00 US/Pacific with current details.", "when": "2018-06-16T00:11:15Z"}, {"created": "2018-06-15T23:05:06Z", "modified": "2018-06-15T23:05:06Z", "text": "Our Engineering Team is evaluating a fix in a test environment. 
We will provide another status update by Friday, 2018-06-15 17:30 US/Pacific with current details.", "when": "2018-06-15T23:05:05Z"}, {"created": "2018-06-15T22:27:28Z", "modified": "2018-06-15T22:27:28Z", "text": "Investigation continues by our Engineering Team. We are investigating workarounds as well as a method to resolve the issue for all affected projects. We will provide another status update by Friday, 2018-06-15 17:30 US/Pacific with current details.", "when": "2018-06-15T22:27:28Z"}, {"created": "2018-06-15T22:27:24Z", "modified": "2018-06-15T22:27:24Z", "text": "Google Compute Engine VM instances allocated with duplicate internal IP addresses", "when": "2018-06-15T22:27:24Z"}], "uri": "/incident/compute/18005"}, {"begin": "2018-06-12T20:00:00Z", "created": "2018-06-12T21:10:43Z", "end": "2018-06-12T23:30:00Z", "external_desc": "Customers unable to see instances in the console", "modified": "2018-06-12T23:36:00Z", "most-recent-update": {"created": "2018-06-12T23:36:00Z", "modified": "2018-06-12T23:36:00Z", "text": "The issue with the Google Cloud Console not displaying Google Compute Engine Instances has been resolved for all affected users as of 16:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-06-12T23:30:00Z"}, "number": 18004, "public": true, "service_key": "developers-console", "service_name": "Google Cloud Console", "severity": "medium", "updates": [{"created": "2018-06-12T23:36:00Z", "modified": "2018-06-12T23:36:00Z", "text": "The issue with the Google Cloud Console not displaying Google Compute Engine Instances has been resolved for all affected users as of 16:30 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-06-12T23:30:00Z"}, {"created": "2018-06-12T22:56:17Z", "modified": "2018-06-12T22:56:17Z", "text": "Our Engineering Team believes they have identified the root cause of the issue with Google Cloud Console failing to list Google Compute Engine instances and is working to mitigate. We will provide another status update by 17:30 US/Pacific with current details.", "when": "2018-06-12T22:56:17Z"}, {"created": "2018-06-12T22:03:46Z", "modified": "2018-06-12T22:03:46Z", "text": "We are investigating an issue with the Google Cloud Console not displaying Google Compute Engine Instances. Our Engineering Team is investigating possible causes. We will provide another status update by 16:30 US/Pacific with current details.", "when": "2018-06-12T22:00:00Z"}, {"created": "2018-06-12T21:10:43Z", "modified": "2018-06-12T21:12:52Z", "text": "Description:\r\nWe are investigating an issue with Google Cloud Console. 
We will provide more information\r\nby 15:00 US/Pacific.\r\n\r\nHow to diagnose:\r\nYou may be affected by this issue if you navigate to list Google Compute Engine instances in the Google Cloud Console and instead see an empty list.\r\n\r\n\r\nWorkaround:\r\nUsing gcloud to list and interact with your instances is a workaround.\r\n\r\nfor example: gcloud compute instances list; gcloud compute ssh instance-1", "when": "2018-06-12T21:10:43Z"}], "uri": "/incident/developers-console/18004"}, {"begin": "2018-06-05T04:24:27Z", "created": "2018-06-05T04:24:28Z", "end": "2018-06-05T05:33:51Z", "external_desc": "The issue with Google Networking in South America should be resolved.", "modified": "2018-06-05T05:33:58Z", "most-recent-update": {"created": "2018-06-05T05:33:58Z", "modified": "2018-06-05T05:33:58Z", "text": "The issue with Google Cloud Networking in South America has been resolved for all affected users as of Monday, 2018-06-04 22:22. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-06-05T05:33:58Z"}, "number": 18011, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "medium", "updates": [{"created": "2018-06-05T05:33:58Z", "modified": "2018-06-05T05:33:58Z", "text": "The issue with Google Cloud Networking in South America has been resolved for all affected users as of Monday, 2018-06-04 22:22. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-06-05T05:33:58Z"}, {"created": "2018-06-05T05:17:24Z", "modified": "2018-06-05T05:17:24Z", "text": "The issue with Google Networking in South America should be resolved for some of users and we expect a full resolution in the near future. 
We will provide another status update by Monday, 2018-06-04 23:15 US/Pacific with current details.", "when": "2018-06-05T05:17:24Z"}, {"created": "2018-06-05T04:57:42Z", "modified": "2018-06-05T04:57:42Z", "text": "The rate of errors is decreasing. We will provide another status update by Monday, 2018-06-04 23:00 US/Pacific.", "when": "2018-06-05T04:57:42Z"}, {"created": "2018-06-05T04:32:11Z", "modified": "2018-06-05T04:32:11Z", "text": "Mitigation work is currently underway by our Engineering Team for Google Cloud Networking issue affecting South America. We will provide more information by Monday, 2018-06-04 22:45 US/Pacific.", "when": "2018-06-05T04:32:11Z"}, {"created": "2018-06-05T04:24:33Z", "modified": "2018-06-05T04:24:33Z", "text": "We are investigating loss for traffic to and from South America. Our Engineering Team is investigating possible causes. We will provide another status update by Monday, 2018-06-04 22:30 US/Pacific.", "when": "2018-06-05T04:24:33Z"}, {"created": "2018-06-05T04:24:29Z", "modified": "2018-06-05T04:24:29Z", "text": "We are investigating loss of traffic with Google Networking in South America.", "when": "2018-06-05T04:24:29Z"}], "uri": "/incident/cloud-networking/18011"}, {"begin": "2018-05-22T15:40:17Z", "created": "2018-05-22T20:10:18Z", "end": "2018-05-22T21:29:06Z", "external_desc": "Issue with Google Cloud project creation", "modified": "2018-05-22T21:29:08Z", "most-recent-update": {"created": "2018-05-22T21:29:08Z", "modified": "2018-05-22T21:29:08Z", "text": "The issue with project creation has been resolved for all affected projects as of Tuesday, 2018-05-22 13:00 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-05-22T21:29:08Z"}, "number": 18003, "public": true, "service_key": "developers-console", "service_name": "Google Cloud Console", "severity": "medium", "updates": [{"created": "2018-05-22T21:29:08Z", "modified": "2018-05-22T21:29:08Z", "text": "The issue with project creation has been resolved for all affected projects as of Tuesday, 2018-05-22 13:00 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-05-22T21:29:08Z"}, {"created": "2018-05-22T20:10:23Z", "modified": "2018-05-22T20:10:23Z", "text": "The rate of errors is decreasing. We will provide another status update by\r\nTuesday, 2018-05-22 14:10 US/Pacific with current details.", "when": "2018-05-22T20:10:23Z"}, {"created": "2018-05-22T20:19:12Z", "modified": "2018-05-22T20:44:27Z", "text": "We are experiencing an issue with creating new projects as well as activating some APIs for projects. beginning at Tuesday, 2018-05-22 07:50 US/Pacific.\r\n\r\nFor everyone who is affected, we apologize for the disruption. 
We will provide an update by Tuesday, 2018-05-22 13:40 US/Pacific with current details.", "when": "2018-05-22T19:03:12Z"}, {"created": "2018-05-22T20:10:20Z", "modified": "2018-05-22T20:45:10Z", "text": "Issue with Google Cloud project creation", "when": "2018-05-22T15:40:20Z"}], "uri": "/incident/developers-console/18003"}, {"begin": "2018-05-21T01:40:51Z", "created": "2018-05-21T03:19:57Z", "end": "2018-05-21T06:05:24Z", "external_desc": "The Stackdriver logging service is experiencing a 30-minute delay.", "modified": "2018-05-24T20:03:52Z", "most-recent-update": {"created": "2018-05-24T20:03:52Z", "modified": "2018-05-24T20:03:52Z", "text": "ISSUE SUMMARY\r\n\r\nOn Sunday, 20 May 2018 for 4 hours and 25 minutes, approximately 6% of Stackdriver Logging logs experienced a median ingest latency of 90 minutes. To our Stackdriver Logging customers whose operations monitoring was impacted during this outage, we apologize. We have conducted an internal investigation and are taking steps to ensure this doesn\u2019t happen again.\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nOn Wednesday, 20 May 2018 from 18:40 to 23:05 Pacific Time, 6% of logs ingested by Stackdriver Logging experienced log event ingest latency of up to 2 hours 30 minutes, with a median latency of 90 minutes. Customers requesting log events within the latency window would receive empty responses. Logging export sinks were not affected.\r\n\r\nROOT CAUSE\r\n\r\nStackdriver Logging uses a pool of workers to persist ingested log events. On Wednesday, 20 May 2018 at 17:40, a load spike in the Stackdriver Logging storage subsystem caused 0.05% of persist calls made by the workers to time out. The workers would then retry persisting to the same address until reaching a retry timeout. While the workers were retrying, they were not persisting other log events. 
This resulted in multiple workers removed from the pool of available workers.\r\n\r\nBy 18:40, enough workers had been removed from the pool to reduce throughput below the level of incoming traffic, creating delays for 6% of logs.\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nAfter Google Engineering was paged, engineers isolated the issue to these timing out workers. At 20:35, engineers configured the workers to return timed out log events to queue and move on to a different log event after timeout. This allowed workers to catch up with ingest rate. At 23:02, the last delayed message was delivered.\r\n\r\nWe are taking the following steps to prevent the issue from happening again: we are modifying the workers to retry persists using alternate addresses to reduce the impact of persist timeouts; we are increasing the persist capacity of the storage subsystem to manage load spikes; we are modifying Stackdriver Logging workers to reduce their unavailability when the storage subsystem experiences higher latency.", "when": "2018-05-24T20:03:52Z"}, "number": 18007, "public": true, "service_key": "google-stackdriver", "service_name": "Google Stackdriver", "severity": "high", "updates": [{"created": "2018-05-24T20:03:52Z", "modified": "2018-05-24T20:03:52Z", "text": "ISSUE SUMMARY\r\n\r\nOn Sunday, 20 May 2018 for 4 hours and 25 minutes, approximately 6% of Stackdriver Logging logs experienced a median ingest latency of 90 minutes. To our Stackdriver Logging customers whose operations monitoring was impacted during this outage, we apologize. We have conducted an internal investigation and are taking steps to ensure this doesn\u2019t happen again.\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nOn Wednesday, 20 May 2018 from 18:40 to 23:05 Pacific Time, 6% of logs ingested by Stackdriver Logging experienced log event ingest latency of up to 2 hours 30 minutes, with a median latency of 90 minutes. 
Customers requesting log events within the latency window would receive empty responses. Logging export sinks were not affected.\r\n\r\nROOT CAUSE\r\n\r\nStackdriver Logging uses a pool of workers to persist ingested log events. On Wednesday, 20 May 2018 at 17:40, a load spike in the Stackdriver Logging storage subsystem caused 0.05% of persist calls made by the workers to time out. The workers would then retry persisting to the same address until reaching a retry timeout. While the workers were retrying, they were not persisting other log events. This resulted in multiple workers removed from the pool of available workers.\r\n\r\nBy 18:40, enough workers had been removed from the pool to reduce throughput below the level of incoming traffic, creating delays for 6% of logs.\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nAfter Google Engineering was paged, engineers isolated the issue to these timing out workers. At 20:35, engineers configured the workers to return timed out log events to queue and move on to a different log event after timeout. This allowed workers to catch up with ingest rate. At 23:02, the last delayed message was delivered.\r\n\r\nWe are taking the following steps to prevent the issue from happening again: we are modifying the workers to retry persists using alternate addresses to reduce the impact of persist timeouts; we are increasing the persist capacity of the storage subsystem to manage load spikes; we are modifying Stackdriver Logging workers to reduce their unavailability when the storage subsystem experiences higher latency.", "when": "2018-05-24T20:03:52Z"}, {"created": "2018-05-21T05:53:26Z", "modified": "2018-05-21T05:53:26Z", "text": "The issue with StackDriver logging delay has been resolved for all affected projects as of Sunday, 2018-05-20 22:45 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. 
We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-05-21T05:53:26Z"}, {"created": "2018-05-21T05:04:36Z", "modified": "2018-05-21T05:04:36Z", "text": "Mitigation work is currently underway by our Engineering Team. We will provide another status update by Sunday, 2018-05-20 23:00 US/Pacific with current details.", "when": "2018-05-21T05:04:36Z"}, {"created": "2018-05-21T03:44:55Z", "modified": "2018-05-21T03:44:55Z", "text": "The Stackdriver logging service is experiencing a 30-minute delay. We will provide another status update by Sunday, 2018-05-20 22:00 US/Pacific with current details.", "when": "2018-05-21T03:44:55Z"}, {"created": "2018-05-21T03:19:59Z", "modified": "2018-05-21T03:19:59Z", "text": "We are investigating an issue with Google Stackdriver. We will provide more information by Sunday, 2018-05-20 20:30 US/Pacific.", "when": "2018-05-21T03:19:59Z"}], "uri": "/incident/google-stackdriver/18007"}, {"begin": "2018-05-19T04:39:59Z", "created": "2018-05-19T04:40:00Z", "end": "2018-05-19T20:21:52Z", "external_desc": "We are investigating an issue with Google Cloud Networking in us-east4.", "modified": "2018-05-19T20:21:55Z", "most-recent-update": {"created": "2018-05-19T20:21:55Z", "modified": "2018-05-19T20:21:55Z", "text": "The issue with external IP allocation in us-east4 has been resolved as of Saturday, 2018-05-19 11:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-05-19T20:21:55Z"}, "number": 18010, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "high", "updates": [{"created": "2018-05-19T20:21:55Z", "modified": "2018-05-19T20:21:55Z", "text": "The issue with external IP allocation in us-east4 has been resolved as of Saturday, 2018-05-19 11:30 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-05-19T20:21:55Z"}, {"created": "2018-05-19T12:58:07Z", "modified": "2018-05-19T12:58:07Z", "text": "Allocation of new external IP addresses for GCE instance creation continues to be unavailable in us-east4. For everyone who is affected, we apologize for the disruption. We will provide an update by Saturday, 2018-05-19 18:00 US/Pacific with current details.", "when": "2018-05-19T12:58:07Z"}, {"created": "2018-05-19T05:57:07Z", "modified": "2018-05-19T05:57:07Z", "text": "We are experiencing an issue with Google Cloud Networking and Google Compute Engine in us-east4 that prevents the creation of GCE instances that require allocation of new external IP addresses. For everyone who is affected, we apologize for the disruption. We will provide an update by Saturday, 2018-05-19 06:00 US/Pacific with current details.", "when": "2018-05-19T05:57:07Z"}, {"created": "2018-05-19T04:51:24Z", "modified": "2018-05-19T04:51:24Z", "text": "We are experiencing an issue with Google Cloud Networking and Google Compute Engine in us-east4 that prevents the creation of GCE instances with external IP addresses attached. Early investigation indicates that all instances in us-east4 are affected by this issue. For everyone who is affected, we apologize for the disruption. We will provide an update by Friday, 2018-05-18 23:00 US/Pacific with current details.", "when": "2018-05-19T04:51:24Z"}, {"created": "2018-05-19T04:40:02Z", "modified": "2018-05-19T04:40:02Z", "text": "We are investigating an issue with Google Cloud Networking in us-east4. 
We will provide more information by Friday, 2018-05-18 22:00 US/Pacific.", "when": "2018-05-19T04:40:02Z"}], "uri": "/incident/cloud-networking/18010"}, {"begin": "2018-05-17T19:33:42Z", "created": "2018-05-17T19:33:44Z", "end": "2018-05-17T20:18:05Z", "external_desc": "Issue affecting Google Cloud Function customers' ability to create and update functions.", "modified": "2018-05-17T20:18:08Z", "most-recent-update": {"created": "2018-05-17T20:18:08Z", "modified": "2018-05-17T20:18:08Z", "text": "The issue with Google Cloud Functions affecting the ability to create and update functions has been resolved for all affected users as of Thursday, 2018-05-17 13:01 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-05-17T20:18:08Z"}, "number": 18001, "public": true, "service_key": "cloud-functions", "service_name": "Google Cloud Functions", "severity": "medium", "updates": [{"created": "2018-05-17T20:18:08Z", "modified": "2018-05-17T20:18:08Z", "text": "The issue with Google Cloud Functions affecting the ability to create and update functions has been resolved for all affected users as of Thursday, 2018-05-17 13:01 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-05-17T20:18:08Z"}, {"created": "2018-05-17T20:08:45Z", "modified": "2018-05-17T20:08:45Z", "text": "The rate of errors is decreasing. We will provide another status update by Thursday, 2018-05-17 13:30 US/Pacific with current details.", "when": "2018-05-17T20:08:45Z"}, {"created": "2018-05-17T19:33:49Z", "modified": "2018-05-17T19:33:49Z", "text": "Mitigation work is currently underway by our Engineering Team. 
We will provide another status update by Thursday, 2018-05-17 13:10 US/Pacific with current details.", "when": "2018-05-17T19:33:49Z"}, {"created": "2018-05-17T19:33:45Z", "modified": "2018-05-17T19:33:45Z", "text": "Issue affecting the ability to update and create Cloud Functions.", "when": "2018-05-17T19:33:45Z"}], "uri": "/incident/cloud-functions/18001"}, {"begin": "2018-05-17T02:27:00Z", "created": "2018-05-17T02:27:48Z", "end": "2018-05-17T03:18:12Z", "external_desc": "GCE Networking issue in us-east4 region affecting GCE VMs, GKE, Cloud VPN and Cloud Private Interconnect ", "modified": "2018-05-17T03:18:12Z", "most-recent-update": {"created": "2018-05-17T03:18:12Z", "modified": "2018-05-17T03:18:12Z", "text": "The issue with GCE Networking (affecting GCE, GKE, Cloud VPN and Cloud Private Interconnect) in us-east4 region has been resolved for all affected users as of Wednesday, 2018-05-16 19:40 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-05-17T03:18:12Z"}, "number": 18004, "public": true, "service_key": "container-engine", "service_name": "Google Kubernetes Engine", "severity": "high", "updates": [{"created": "2018-05-17T03:18:12Z", "modified": "2018-05-17T03:18:12Z", "text": "The issue with GCE Networking (affecting GCE, GKE, Cloud VPN and Cloud Private Interconnect) in us-east4 region has been resolved for all affected users as of Wednesday, 2018-05-16 19:40 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. 
We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-05-17T03:18:12Z"}, {"created": "2018-05-17T02:59:14Z", "modified": "2018-05-17T02:59:14Z", "text": "The issue with GCE Networking (affecting GCE, GKE, Cloud VPN and Cloud Private Interconnect) in us-east4 region should be resolved for the majority of users and we expect a full resolution in the near future. We will provide another status update by Wednesday, 2018-05-16 20:20 US/Pacific with current details.", "when": "2018-05-17T02:59:14Z"}, {"created": "2018-05-17T02:38:27Z", "modified": "2018-05-17T02:38:27Z", "text": "Mitigation work is currently underway by our Engineering Team. We will provide another status update by Wednesday, 2018-05-16 20:10 US/Pacific with current details.", "when": "2018-05-17T02:38:27Z"}, {"created": "2018-05-17T02:27:49Z", "modified": "2018-05-17T02:27:49Z", "text": "We are investigating an issue with GCE Networking in us-east4 region affecting GCE VMs, GKE, Cloud VPN and Cloud Private Interconnect resulting in network packet loss. 
We will provide more information by Wednesday, 2018-05-16 19:43 US/Pacific.", "when": "2018-05-17T02:27:49Z"}], "uri": "/incident/container-engine/18004"}, {"begin": "2018-05-17T02:24:00Z", "created": "2018-05-17T02:24:32Z", "end": "2018-05-17T03:17:52Z", "external_desc": "GCE Networking issue in us-east4", "modified": "2018-05-22T20:34:52Z", "most-recent-update": {"created": "2018-05-22T20:32:52Z", "modified": "2018-05-22T20:34:52Z", "text": "ISSUE SUMMARY\r\n\r\nOn Wednesday 16 May 2018, Google Cloud Networking experienced loss of connectivity to external IP addresses located in us-east4 for a duration of 58 minutes.\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nOn Wednesday 16 May 2018 from 18:43 to 19:41 PDT, Google Compute Engine, Google Cloud VPN, and Google Cloud Network Load Balancers hosted in the us-east4 region experienced 100% packet loss from the internet and other GCP regions. Google Dedicated Interconnect Attachments located in us-east4 also experienced loss of connectivity.\r\n\r\nROOT CAUSE\r\n\r\nEvery zone in Google Cloud Platform advertises several sets of IP addresses to the Internet via BGP.  Some of these IP addresses are global and are advertised from every zone, others are regional and advertised only from zones in the region. The software that controls the advertisement of these IP addresses contained a race condition during application startup that would cause regional IP addresses to be filtered out and withdrawn from a zone. During a routine binary rollout of this software, the race condition was triggered in each of the three zones in the us-east4 region. Traffic continued to be routed until the last zone received the rollout and stopped advertising regional prefixes. 
Once the last zone stopped advertising the regional IP addresses, external regional traffic stopped entering us-east4.\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nGoogle engineers were alerted to the problem within one minute and as soon as investigation pointed to a problem with the BGP advertisements, a rollback of the binary in the us-east4 region was created to mitigate the issue. Once the rollback proved effective, the original rollout was paused globally to prevent any further issues. \r\n\r\nWe are taking the following steps to prevent the issue from happening again. We are adding additional monitoring which will provide better context in future alerts to allow us to diagnose issues faster. We also plan on improving the debuggability of the software that controls the BGP advertisements. Additionally, we will be reviewing the rollout policy for these types of software changes so we can detect issues before they impact an entire region.\r\n\r\nWe apologize for this incident and we recognize that regional outages like this should be rare and quickly rectified. We are taking immediate steps to prevent recurrence and improve reliability in the future.", "when": "2018-05-22T20:32:52Z"}, "number": 18009, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "high", "updates": [{"created": "2018-05-22T20:32:52Z", "modified": "2018-05-22T20:34:52Z", "text": "ISSUE SUMMARY\r\n\r\nOn Wednesday 16 May 2018, Google Cloud Networking experienced loss of connectivity to external IP addresses located in us-east4 for a duration of 58 minutes.\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nOn Wednesday 16 May 2018 from 18:43 to 19:41 PDT, Google Compute Engine, Google Cloud VPN, and Google Cloud Network Load Balancers hosted in the us-east4 region experienced 100% packet loss from the internet and other GCP regions. 
Google Dedicated Interconnect Attachments located in us-east4 also experienced loss of connectivity.\r\n\r\nROOT CAUSE\r\n\r\nEvery zone in Google Cloud Platform advertises several sets of IP addresses to the Internet via BGP.  Some of these IP addresses are global and are advertised from every zone, others are regional and advertised only from zones in the region. The software that controls the advertisement of these IP addresses contained a race condition during application startup that would cause regional IP addresses to be filtered out and withdrawn from a zone. During a routine binary rollout of this software, the race condition was triggered in each of the three zones in the us-east4 region. Traffic continued to be routed until the last zone received the rollout and stopped advertising regional prefixes. Once the last zone stopped advertising the regional IP addresses, external regional traffic stopped entering us-east4.\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nGoogle engineers were alerted to the problem within one minute and as soon as investigation pointed to a problem with the BGP advertisements, a rollback of the binary in the us-east4 region was created to mitigate the issue. Once the rollback proved effective, the original rollout was paused globally to prevent any further issues. \r\n\r\nWe are taking the following steps to prevent the issue from happening again. We are adding additional monitoring which will provide better context in future alerts to allow us to diagnose issues faster. We also plan on improving the debuggability of the software that controls the BGP advertisements. Additionally, we will be reviewing the rollout policy for these types of software changes so we can detect issues before they impact an entire region.\r\n\r\nWe apologize for this incident and we recognize that regional outages like this should be rare and quickly rectified. 
We are taking immediate steps to prevent recurrence and improve reliability in the future.", "when": "2018-05-22T20:32:52Z"}, {"created": "2018-05-17T03:17:52Z", "modified": "2018-05-17T03:17:52Z", "text": "The issue with GCE Networking (affecting GCE, GKE, Cloud VPN and Cloud Private Interconnect) in us-east4 region has been resolved for all affected users as of Wednesday, 2018-05-16 19:40 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-05-17T03:17:52Z"}, {"created": "2018-05-17T02:59:38Z", "modified": "2018-05-17T02:59:38Z", "text": "The issue with GCE Networking (affecting GCE, GKE, Cloud VPN and Cloud Private Interconnect) in us-east4 region should be resolved for the majority of users and we expect a full resolution in the near future. We will provide another status update by Wednesday, 2018-05-16 20:20 US/Pacific with current details.", "when": "2018-05-17T02:59:38Z"}, {"created": "2018-05-17T02:38:07Z", "modified": "2018-05-17T02:38:07Z", "text": "Mitigation work is currently underway by our Engineering Team. We will provide another status update by Wednesday, 2018-05-16 20:10 US/Pacific with current details.", "when": "2018-05-17T02:38:07Z"}, {"created": "2018-05-17T02:24:32Z", "modified": "2018-05-17T02:24:32Z", "text": "We are investigating an issue with GCE Networking in us-east4 region affecting GCE VMs, GKE, Cloud VPN and Cloud Private Interconnect resulting in network packet loss. 
We will provide more information by Wednesday, 2018-05-16 19:43 US/Pacific.", "when": "2018-05-17T02:24:32Z"}], "uri": "/incident/cloud-networking/18009"}, {"begin": "2018-05-17T02:11:29Z", "created": "2018-05-17T02:11:33Z", "end": "2018-05-17T03:17:24Z", "external_desc": "The issue with GCE Networking (affecting GCE, GKE, Cloud VPN and Cloud Private Interconnect) in us-east4 region has been resolved for all affected users as of Wednesday, 2018-05-16 19:40 US/Pacific.", "modified": "2018-05-17T03:17:26Z", "most-recent-update": {"created": "2018-05-17T03:17:26Z", "modified": "2018-05-17T03:17:26Z", "text": "The issue with GCE Networking (affecting GCE, GKE, Cloud VPN and Cloud Private Interconnect) in us-east4 region has been resolved for all affected users as of Wednesday, 2018-05-16 19:40 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-05-17T03:17:26Z"}, "number": 18004, "public": true, "service_key": "compute", "service_name": "Google Compute Engine", "severity": "high", "updates": [{"created": "2018-05-17T03:17:26Z", "modified": "2018-05-17T03:17:26Z", "text": "The issue with GCE Networking (affecting GCE, GKE, Cloud VPN and Cloud Private Interconnect) in us-east4 region has been resolved for all affected users as of Wednesday, 2018-05-16 19:40 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. 
We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-05-17T03:17:26Z"}, {"created": "2018-05-17T02:58:51Z", "modified": "2018-05-17T02:58:51Z", "text": "The issue with GCE Networking (affecting GCE, GKE, Cloud VPN and Cloud Private Interconnect) in us-east4 region should be resolved for the majority of users and we expect a full resolution in the near future. We will provide another status update by Wednesday, 2018-05-16 20:20 US/Pacific with current details.", "when": "2018-05-17T02:58:51Z"}, {"created": "2018-05-17T02:35:26Z", "modified": "2018-05-17T02:35:26Z", "text": "Mitigation work is currently underway by our Engineering Team. We will provide another status update by Wednesday, 2018-05-16 20:10 US/Pacific with current details.", "when": "2018-05-17T02:35:26Z"}, {"created": "2018-05-17T02:12:01Z", "modified": "2018-05-17T02:12:01Z", "text": "We are investigating an issue with GCE Networking in us-east4 region affecting GCE VMs, Cloud VPN and Cloud Private Interconnect resulting in network packet loss. 
We will provide more information by Wednesday, 2018-05-16 19:43 US/Pacific.", "when": "2018-05-17T02:12:01Z"}, {"created": "2018-05-17T02:11:41Z", "modified": "2018-05-17T02:11:41Z", "text": "GCE Networking in us-east4 region affecting GCE VMs, Cloud VPN and Cloud Private Interconnect resulting in network packet loss.", "when": "2018-05-17T02:11:41Z"}], "uri": "/incident/compute/18004"}, {"begin": "2018-05-16T23:00:00Z", "created": "2018-05-17T00:17:10Z", "end": "2018-05-17T01:18:14Z", "external_desc": "Multiple failing BigQuery job types", "modified": "2018-05-18T21:34:17Z", "most-recent-update": {"created": "2018-05-18T21:34:16Z", "modified": "2018-05-18T21:34:16Z", "text": "ISSUE SUMMARY\r\n\r\nOn Wednesday 16 May 2018, Google BigQuery experienced failures of import, export and query jobs for a duration of 88 minutes over two time periods (55 minutes initially, and 33 minutes in the second, which was isolated to the EU). We sincerely apologize to all of our affected customers; this is not the level of reliability we aim to provide in our products. We will be issuing SLA credits to customers who were affected by this incident and we are taking immediate steps to prevent a future recurrence of these failures.\r\n\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nOn Wednesday 16 May 2018 from 16:00 to 16:55 and from 17:45 to 18:18 PDT, Google BigQuery experienced a failure of some import, export and query jobs. During the first period of impact, there was a 15.26% job failure rate; during the second, which was isolated to the EU, there was a 2.23% error rate. Affected jobs would have failed with INTERNAL_ERROR as the reason.\r\n\r\nROOT CAUSE\r\n\r\nConfiguration changes being rolled out on the evening of the incident were not applied in the intended order. This resulted in an incomplete configuration change becoming live in some zones, subsequently triggering the failure of customer jobs. 
During the process of rolling back the configuration, another incorrect configuration change was inadvertently applied, causing the second batch of job failures.\r\n\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nAutomated monitoring alerted engineering teams 15 minutes after the error threshold was met and were able to correlate the errors with the configuration change 3 minutes later. We feel that the configured alert delay is too long and have lowered it to 5 minutes in order to aid in quicker detection.\r\n\r\nDuring the rollback attempt, another bad configuration change was enqueued for automatic rollout and when unblocked, proceeded to roll out, triggering the second round of job failures. To prevent this from happening in the future, we are working to ensure that rollouts are automatically switched to manual mode when engineers are responding to production incidents.\r\n\r\nIn addition, we're switching to a different configuration system which will ensure the consistency of configuration at all stages of the rollout.", "when": "2018-05-18T21:34:16Z"}, "number": 18036, "public": true, "service_key": "bigquery", "service_name": "Google BigQuery", "severity": "high", "updates": [{"created": "2018-05-18T21:34:16Z", "modified": "2018-05-18T21:34:16Z", "text": "ISSUE SUMMARY\r\n\r\nOn Wednesday 16 May 2018, Google BigQuery experienced failures of import, export and query jobs for a duration of 88 minutes over two time periods (55 minutes initially, and 33 minutes in the second, which was isolated to the EU). We sincerely apologize to all of our affected customers; this is not the level of reliability we aim to provide in our products. 
We will be issuing SLA credits to customers who were affected by this incident and we are taking immediate steps to prevent a future recurrence of these failures.\r\n\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nOn Wednesday 16 May 2018 from 16:00 to 16:55 and from 17:45 to 18:18 PDT, Google BigQuery experienced a failure of some import, export and query jobs. During the first period of impact, there was a 15.26% job failure rate; during the second, which was isolated to the EU, there was a 2.23% error rate. Affected jobs would have failed with INTERNAL_ERROR as the reason.\r\n\r\nROOT CAUSE\r\n\r\nConfiguration changes being rolled out on the evening of the incident were not applied in the intended order. This resulted in an incomplete configuration change becoming live in some zones, subsequently triggering the failure of customer jobs. During the process of rolling back the configuration, another incorrect configuration change was inadvertently applied, causing the second batch of job failures.\r\n\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nAutomated monitoring alerted engineering teams 15 minutes after the error threshold was met and were able to correlate the errors with the configuration change 3 minutes later. We feel that the configured alert delay is too long and have lowered it to 5 minutes in order to aid in quicker detection.\r\n\r\nDuring the rollback attempt, another bad configuration change was enqueued for automatic rollout and when unblocked, proceeded to roll out, triggering the second round of job failures. 
To prevent this from happening in the future, we are working to ensure that rollouts are automatically switched to manual mode when engineers are responding to production incidents.\r\n\r\nIn addition, we're switching to a different configuration system which will ensure the consistency of configuration at all stages of the rollout.", "when": "2018-05-18T21:34:16Z"}, {"created": "2018-05-17T01:01:14Z", "modified": "2018-05-17T01:01:14Z", "text": "The issue with Google BigQuery has been resolved for all affected users as of 2018-05-16 17:06 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.\r\nShort Summary", "when": "2018-05-17T01:01:14Z"}, {"created": "2018-05-17T00:17:10Z", "modified": "2018-05-17T00:17:10Z", "text": "We are rolling back a configuration change to mitigate this issue. We will provide another status update by Wednesday 2018-05-16 17:21 US/Pacific with current details.", "when": "2018-05-17T00:17:10Z"}], "uri": "/incident/bigquery/18036"}, {"begin": "2018-05-07T16:51:46Z", "created": "2018-05-07T16:51:47Z", "end": "2018-05-07T17:07:15Z", "external_desc": "We've received a report of connectivity issues from GCE instances.", "modified": "2018-05-07T17:08:27Z", "most-recent-update": {"created": "2018-05-07T17:07:17Z", "modified": "2018-05-07T17:07:17Z", "text": "The network connectivity issues from GCE instances has been resolved for all affected users as of  2018-05-07 10:00  US/Pacific. 
We will conduct\r\nan internal investigation of this issue and make appropriate improvements to our\r\nsystems to help prevent or minimize future recurrence.", "when": "2018-05-07T17:07:17Z"}, "number": 18008, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "medium", "updates": [{"created": "2018-05-07T17:07:17Z", "modified": "2018-05-07T17:07:17Z", "text": "The network connectivity issues from GCE instances has been resolved for all affected users as of  2018-05-07 10:00  US/Pacific. We will conduct\r\nan internal investigation of this issue and make appropriate improvements to our\r\nsystems to help prevent or minimize future recurrence.", "when": "2018-05-07T17:07:17Z"}, {"created": "2018-05-07T16:51:49Z", "modified": "2018-05-07T16:51:49Z", "text": "We are investigating connectivity issues from GCE instances. We will provide more information by 2018-05-07 10:30 US/Pacific.", "when": "2018-05-07T16:51:49Z"}], "uri": "/incident/cloud-networking/18008"}, {"begin": "2018-05-03T18:10:00Z", "created": "2018-05-04T06:50:51Z", "end": "2018-05-03T18:35:00Z", "external_desc": "Google Support Center availability issue", "modified": "2018-05-04T06:50:51Z", "most-recent-update": {"created": "2018-05-04T06:50:51Z", "modified": "2018-05-04T06:50:51Z", "text": "The Google Support Center availability issue has been resolved for all affected users as of 2018-05-03 23:35 US/Pacific, Thursday. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-05-04T06:50:51Z"}, "number": 18001, "public": true, "service_key": "support", "service_name": "Google Cloud Support", "severity": "high", "updates": [{"created": "2018-05-04T06:50:51Z", "modified": "2018-05-04T06:50:51Z", "text": "The Google Support Center availability issue has been resolved for all affected users as of 2018-05-03 23:35 US/Pacific, Thursday. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-05-04T06:50:51Z"}], "uri": "/incident/support/18001"}, {"begin": "2018-05-02T21:02:35Z", "created": "2018-05-02T21:02:37Z", "end": "2018-05-02T21:19:19Z", "external_desc": "We are investigating an issue with increased packet loss in us-central1 with Google Cloud Networking.", "modified": "2018-05-08T15:25:56Z", "most-recent-update": {"created": "2018-05-08T15:24:46Z", "modified": "2018-05-08T15:25:56Z", "text": "ISSUE SUMMARY\r\n\r\nOn Wednesday 2 May, 2018 Google Cloud Networking experienced increased packet loss to the internet as well as other Google regions from the us-central1 region for a duration of 21 minutes. We understand that the network is a critical component that binds all services together. We have conducted an internal investigation and are taking steps to improve our service.\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nOn Wednesday 2 May, 2018 from 13:47 to 14:08 PDT, traffic between all zones in the us-central1 region and all destinations experienced 12% packet loss. Traffic between us-central1 zones experienced 22% packet loss. Customers may have seen requests succeed to services hosted in us-central1 as loss was not evenly distributed, some connections did not experience any loss while others experienced 100% packet loss.\r\n\r\nROOT CAUSE\r\n\r\nA control plane is used to manage configuration changes to the network fabric connecting zones in us-central1 to each other as well as the Internet. On Wednesday 2 May, 2018 Google Cloud Network engineering began deploying a configuration change using the control plane as part of planned maintenance work. During the deployment, a bad configuration was generated that blackholed a portion of the traffic flowing over the fabric. \r\n\r\nThe control plane had a bug in it, which caused it to produce an incorrect configuration. 
New configurations deployed to the network fabric are evaluated for correctness, and regenerated if an error is found. In this case, the configuration error appeared after the configuration was evaluated, which resulted in deploying the erroneous configuration to the network fabric.\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nAutomated monitoring alerted engineering teams 2 minutes after the loss started. Google engineers correlated the alerts to the configuration push and routed traffic away from the affected part of the fabric. Mitigation completed 21 minutes after loss began, ending impact to customers.\r\n\r\nAfter isolating the root cause, engineers then audited all configuration changes that were generated by the control plane and replaced them with known-good configurations.\r\n\r\nTo prevent this from recurring, we will correct the control plane defect that generated the incorrect configuration and are adding additional validation at the fabric layer in order to more robustly detect configuration errors. Additionally, we intend on adding logic to the network control plane to be able to self-heal by automatically routing traffic away from the parts of the network fabric in an error state. Finally, we plan on evaluating further isolation of control plane configuration changes to reduce the size of the possible failure domain.\r\n\r\nAgain, we would like to apologize for this issue. We are taking immediate steps to improve the platform\u2019s performance and availability.", "when": "2018-05-08T15:24:46Z"}, "number": 18007, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "medium", "updates": [{"created": "2018-05-08T15:24:46Z", "modified": "2018-05-08T15:25:56Z", "text": "ISSUE SUMMARY\r\n\r\nOn Wednesday 2 May, 2018 Google Cloud Networking experienced increased packet loss to the internet as well as other Google regions from the us-central1 region for a duration of 21 minutes. 
We understand that the network is a critical component that binds all services together. We have conducted an internal investigation and are taking steps to improve our service.\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nOn Wednesday 2 May, 2018 from 13:47 to 14:08 PDT, traffic between all zones in the us-central1 region and all destinations experienced 12% packet loss. Traffic between us-central1 zones experienced 22% packet loss. Customers may have seen requests succeed to services hosted in us-central1 as loss was not evenly distributed, some connections did not experience any loss while others experienced 100% packet loss.\r\n\r\nROOT CAUSE\r\n\r\nA control plane is used to manage configuration changes to the network fabric connecting zones in us-central1 to each other as well as the Internet. On Wednesday 2 May, 2018 Google Cloud Network engineering began deploying a configuration change using the control plane as part of planned maintenance work. During the deployment, a bad configuration was generated that blackholed a portion of the traffic flowing over the fabric. \r\n\r\nThe control plane had a bug in it, which caused it to produce an incorrect configuration. New configurations deployed to the network fabric are evaluated for correctness, and regenerated if an error is found. In this case, the configuration error appeared after the configuration was evaluated, which resulted in deploying the erroneous configuration to the network fabric.\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nAutomated monitoring alerted engineering teams 2 minutes after the loss started. Google engineers correlated the alerts to the configuration push and routed traffic away from the affected part of the fabric. 
Mitigation completed 21 minutes after loss began, ending impact to customers.\r\n\r\nAfter isolating the root cause, engineers then audited all configuration changes that were generated by the control plane and replaced them with known-good configurations.\r\n\r\nTo prevent this from recurring, we will correct the control plane defect that generated the incorrect configuration and are adding additional validation at the fabric layer in order to more robustly detect configuration errors. Additionally, we intend on adding logic to the network control plane to be able to self-heal by automatically routing traffic away from the parts of the network fabric in an error state. Finally, we plan on evaluating further isolation of control plane configuration changes to reduce the size of the possible failure domain.\r\n\r\nAgain, we would like to apologize for this issue. We are taking immediate steps to improve the platform\u2019s performance and availability.", "when": "2018-05-08T15:24:46Z"}, {"created": "2018-05-02T21:19:26Z", "modified": "2018-05-02T21:19:26Z", "text": "The issue with Google Cloud Networking having increased packet loss in us-central1 has been resolved for all affected users as of Wednesday, 2018-05-02 14:10 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-05-02T21:19:26Z"}, {"created": "2018-05-02T21:02:43Z", "modified": "2018-05-02T21:02:43Z", "text": "We are investigating an issue with Google Cloud Networking. 
We will provide more information by 14:45 US/Pacific.", "when": "2018-05-02T21:02:43Z"}], "uri": "/incident/cloud-networking/18007"}, {"begin": "2018-05-02T15:22:17Z", "created": "2018-05-02T15:22:20Z", "end": "2018-05-02T16:07:53Z", "external_desc": "The Cloud Shell availability issue has been resolved for all affected users as of 2018-05-02 08:56 US/Pacific.", "modified": "2018-05-02T16:07:55Z", "most-recent-update": {"created": "2018-05-02T16:07:55Z", "modified": "2018-05-02T16:07:55Z", "text": "The Cloud Shell availability issue has been resolved for all affected users as of 2018-05-02 08:56 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-05-02T16:07:55Z"}, "number": 18002, "public": true, "service_key": "cloud-dev-tools", "service_name": "Cloud Developer Tools", "severity": "high", "updates": [{"created": "2018-05-02T16:07:55Z", "modified": "2018-05-02T16:07:55Z", "text": "The Cloud Shell availability issue has been resolved for all affected users as of 2018-05-02 08:56 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-05-02T16:07:55Z"}, {"created": "2018-05-02T15:42:54Z", "modified": "2018-05-02T15:42:54Z", "text": "Our Engineering Team believes they have identified the potential\r\nroot cause of the issue. We will provide another status\r\nupdate by 09:15 US/Pacific with current details.", "when": "2018-05-02T15:42:54Z"}, {"created": "2018-05-02T15:22:21Z", "modified": "2018-05-02T15:22:21Z", "text": "We are investigating an issue with Cloud Shell. 
We will provide more information by 08:45 US/Pacific.", "when": "2018-05-02T15:22:21Z"}], "uri": "/incident/cloud-dev-tools/18002"}, {"begin": "2018-04-30T18:52:11Z", "created": "2018-04-30T19:46:14Z", "end": "2018-04-30T20:06:29Z", "external_desc": "Users may be experiencing increased error rates when accessing the Stackdriver web UI", "modified": "2018-04-30T20:06:33Z", "most-recent-update": {"created": "2018-04-30T20:06:33Z", "modified": "2018-04-30T20:06:33Z", "text": "The issue with Stackdriver Web UI Returning 500 and 502 error codes has been resolved for all affected users as of Monday, 2018-04-30 13:01 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-04-30T20:06:33Z"}, "number": 18006, "public": true, "service_key": "google-stackdriver", "service_name": "Google Stackdriver", "severity": "medium", "updates": [{"created": "2018-04-30T20:06:33Z", "modified": "2018-04-30T20:06:33Z", "text": "The issue with Stackdriver Web UI Returning 500 and 502 error codes has been resolved for all affected users as of Monday, 2018-04-30 13:01 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-04-30T20:06:33Z"}, {"created": "2018-04-30T19:46:24Z", "modified": "2018-04-30T19:46:24Z", "text": "Mitigations are proving to be effective, error rates of 500s and 502s are decreasing, though are still elevated. 
We will provide more information by Monday, 2018-04-30 14:30 US/Pacific.", "when": "2018-04-30T19:46:24Z"}, {"created": "2018-04-30T19:46:21Z", "modified": "2018-04-30T19:46:21Z", "text": "Users may be experiencing increased error rates when accessing the Stackdriver web UI", "when": "2018-04-30T19:46:21Z"}], "uri": "/incident/google-stackdriver/18006"}, {"begin": "2018-04-21T12:05:00Z", "created": "2018-04-21T13:43:29Z", "end": "2018-04-21T16:30:49Z", "external_desc": "We are investigating an issue with Google Cloud Pub/Sub that is affecting message delivery in some regions.", "modified": "2018-04-21T16:30:52Z", "most-recent-update": {"created": "2018-04-21T16:30:51Z", "modified": "2018-04-21T16:30:51Z", "text": "The issue with Google Cloud Pub/Sub message delivery has been resolved for all affected users as of 09:04 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-04-21T16:30:51Z"}, "number": 18002, "public": true, "service_key": "cloud-pubsub", "service_name": "Google Cloud Pub/Sub", "severity": "medium", "updates": [{"created": "2018-04-21T16:30:51Z", "modified": "2018-04-21T16:30:51Z", "text": "The issue with Google Cloud Pub/Sub message delivery has been resolved for all affected users as of 09:04 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-04-21T16:30:51Z"}, {"created": "2018-04-21T15:29:49Z", "modified": "2018-04-21T15:29:49Z", "text": "The mitigation work is currently underway by our Engineering Team and appears to be working. The messages that were delayed during this incident are starting to get delivered. 
We expect a full resolution in the near future.", "when": "2018-04-21T15:29:49Z"}, {"created": "2018-04-21T14:16:52Z", "modified": "2018-04-21T14:16:52Z", "text": "Our Engineering Team believes they have identified the root cause of the Cloud Pub/Sub message delivery issues and is working to mitigate.", "when": "2018-04-21T14:16:52Z"}, {"created": "2018-04-21T13:43:37Z", "modified": "2018-04-21T13:43:37Z", "text": "We are still investigating message delivery issues with Google Cloud Pub/Sub. Our Engineering Team is investigating\r\npossible causes. We will provide another status update by 7:30 US/Pacific with current details.", "when": "2018-04-21T13:43:37Z"}, {"created": "2018-04-21T13:43:34Z", "modified": "2018-04-21T13:59:54Z", "text": "We are investigating an issue with Google Cloud Pub/Sub that is affecting message delivery in some regions.\r\n\r\nThis issue is also affecting Google Cloud SQL. Since 02:30 US/Pacific customers may see errors when trying to list/create/change users and databases. Both changes via Google Cloud Console and via gcloud are affected.", "when": "2018-04-21T12:50:00Z"}], "uri": "/incident/cloud-pubsub/18002"}, {"begin": "2018-04-19T00:30:50Z", "created": "2018-04-19T02:59:52Z", "end": "2018-04-19T02:14:53Z", "external_desc": "We are investigating an issue with Google Cloud Dataflow, Dataproc, GCE and GCR.", "modified": "2018-04-19T03:30:38Z", "most-recent-update": {"created": "2018-04-19T03:04:56Z", "modified": "2018-04-19T03:04:56Z", "text": "The issue with Dataflow, Dataproc, Compute Engine and GCR has been resolved for all affected users as of 19:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. 
We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-04-19T03:04:56Z"}, "number": 18001, "public": true, "service_key": "cloud-dataflow", "service_name": "Google Cloud Dataflow", "severity": "high", "updates": [{"created": "2018-04-19T03:04:56Z", "modified": "2018-04-19T03:04:56Z", "text": "The issue with Dataflow, Dataproc, Compute Engine and GCR has been resolved for all affected users as of 19:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-04-19T03:04:56Z"}, {"created": "2018-04-19T02:59:57Z", "modified": "2018-04-19T02:59:57Z", "text": "The issue with Dataflow, Dataproc, Compute Engine and GCR has been resolved for all affected users as of 19:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-04-19T02:59:57Z"}, {"created": "2018-04-19T02:59:54Z", "modified": "2018-04-19T02:59:54Z", "text": "We are investigating an issue with Google Cloud Dataflow, Dataproc, GCE and GCR. We will provide more information by 20:00 US/Pacific.", "when": "2018-04-19T02:59:54Z"}], "uri": "/incident/cloud-dataflow/18001"}, {"begin": "2018-04-17T23:51:24Z", "created": "2018-04-17T23:51:25Z", "end": "2018-04-18T14:31:52Z", "external_desc": "The issue with StackDriver Logging GCS Exports is resolved. 
We have finished processing the backlog of GCS Export jobs.", "modified": "2018-04-18T14:31:56Z", "most-recent-update": {"created": "2018-04-18T14:31:56Z", "modified": "2018-04-18T14:31:56Z", "text": "The issue with StackDriver Logging GCS Exports has been resolved for all affected users. We have completed processing the backlog of GCS export jobs. An internal investigation has been started to make the appropriate improvements to our systems and help prevent or minimize future recurrence.", "when": "2018-04-18T14:31:56Z"}, "number": 18005, "public": true, "service_key": "google-stackdriver", "service_name": "Google Stackdriver", "severity": "high", "updates": [{"created": "2018-04-18T14:31:56Z", "modified": "2018-04-18T14:31:56Z", "text": "The issue with StackDriver Logging GCS Exports has been resolved for all affected users. We have completed processing the backlog of GCS export jobs. An internal investigation has been started to make the appropriate improvements to our systems and help prevent or minimize future recurrence.", "when": "2018-04-18T14:31:56Z"}, {"created": "2018-04-18T08:09:50Z", "modified": "2018-04-18T08:09:50Z", "text": "The issue with StackDriver Logging GCS Export is mitigated and backlog processing is ongoing. We will provide another update by 08:00 US/Pacific with current status of the backlog.", "when": "2018-04-18T08:09:50Z"}, {"created": "2018-04-18T05:03:57Z", "modified": "2018-04-18T05:03:57Z", "text": "The issue with StackDriver Logging GCS Export is mitigated and backlog processing is ongoing. We will provide another update by 01:00 US/Pacific with current status of the backlog.", "when": "2018-04-18T05:03:57Z"}, {"created": "2018-04-18T02:07:56Z", "modified": "2018-04-18T02:07:56Z", "text": "Our Engineering Team believes they have identified and mitigated the root cause of the delay on StackDriver Logging GCS Export service. We are actively working to process the backlogs. 
We will provide an update by 22:00 US/Pacific with current progress.", "when": "2018-04-18T02:07:56Z"}, {"created": "2018-04-18T00:47:18Z", "modified": "2018-04-18T00:47:18Z", "text": "Mitigation work is still underway by our Engineering Team to address the delay issue with Google StackDriver Logging GCS Export. We will provide more information by 18:45 US/Pacific.", "when": "2018-04-18T00:47:18Z"}, {"created": "2018-04-18T00:06:58Z", "modified": "2018-04-18T00:06:58Z", "text": "Mitigation work is currently underway by our Engineering Team to address the issue with Google StackDriver Logging GCS Export. We will provide more information by 17:45 US/Pacific.", "when": "2018-04-18T00:06:58Z"}, {"created": "2018-04-17T23:51:27Z", "modified": "2018-04-17T23:51:27Z", "text": "We are investigating an issue with Google StackDriver Logging GCS Export. We will provide more information by 17:15 US/Pacific.", "when": "2018-04-17T23:51:27Z"}], "uri": "/incident/google-stackdriver/18005"}, {"begin": "2018-03-29T12:29:29Z", "created": "2018-03-29T12:29:31Z", "end": "2018-03-29T12:29:34Z", "external_desc": "We've received a report of an issue with Google App Engine as of 2018-03-29 04:52 US/Pacific. We will provide more information by 05:30 US/Pacific.", "modified": "2018-03-29T14:22:50Z", "most-recent-update": {"created": "2018-03-29T12:29:36Z", "modified": "2018-03-29T14:22:50Z", "text": "The issue with Cloud Datastore in europe-west2 has been resolved for all affected users as of 5:03 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-03-29T12:29:36Z"}, "number": 18002, "public": true, "service_key": "cloud-datastore", "service_name": "Google Cloud Datastore", "severity": "medium", "updates": [{"created": "2018-03-29T12:29:36Z", "modified": "2018-03-29T14:22:50Z", "text": "The issue with Cloud Datastore in europe-west2 has been resolved for all affected users as of 5:03 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-03-29T12:29:36Z"}, {"created": "2018-03-29T12:29:33Z", "modified": "2018-03-29T12:29:33Z", "text": "", "when": "2018-03-29T12:29:33Z"}], "uri": "/incident/cloud-datastore/18002"}, {"begin": "2018-03-16T19:06:18Z", "created": "2018-03-16T19:06:19Z", "end": "2018-03-16T21:00:00Z", "external_desc": "We've received a report of an issue with Google Compute Engine as of 2018-03-16 11:32 US/Pacific. We will provide more information by 12:15 US/Pacific.", "modified": "2018-03-16T21:00:04Z", "most-recent-update": {"created": "2018-03-16T21:00:03Z", "modified": "2018-03-16T21:00:03Z", "text": "The issue with slow network programming is resolved for all zones in us-east1 as of 12:44 US/Pacific.  We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-03-16T21:00:03Z"}, "number": 18003, "public": true, "service_key": "compute", "service_name": "Google Compute Engine", "severity": "medium", "updates": [{"created": "2018-03-16T21:00:03Z", "modified": "2018-03-16T21:00:03Z", "text": "The issue with slow network programming is resolved for all zones in us-east1 as of 12:44 US/Pacific.  
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-03-16T21:00:03Z"}, {"created": "2018-03-16T20:06:29Z", "modified": "2018-03-16T20:06:29Z", "text": "The issue with slow network programming should be resolved for all zones in us-east1 as of 12:44 US/Pacific.  The root cause has been identified and we are working to prevent a recurrence.  We will provide more information by 14:00 US/Pacific.", "when": "2018-03-16T20:06:29Z"}, {"created": "2018-03-16T19:06:26Z", "modified": "2018-03-16T19:06:26Z", "text": "The Google Compute Engine service is experiencing delays in network programming in us-east1. We will provide another status update by 13:00 US/Pacific with current details.", "when": "2018-03-16T19:06:26Z"}, {"created": "2018-03-16T19:06:22Z", "modified": "2018-03-16T19:06:22Z", "text": "", "when": "2018-03-16T19:06:22Z"}], "uri": "/incident/compute/18003"}, {"begin": "2018-03-14T01:02:44Z", "created": "2018-03-14T01:02:46Z", "end": "2018-03-14T02:23:21Z", "external_desc": "We are investigating an issue with Google Cloud Shell as of 2018-03-13 17:44 US/Pacific.", "modified": "2018-03-14T02:23:23Z", "most-recent-update": {"created": "2018-03-14T02:23:23Z", "modified": "2018-03-14T02:23:23Z", "text": "The issue with Cloud Shell has been resolved for all affected users as of Tue 2018-03-13 19:00 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-03-14T02:23:23Z"}, "number": 18001, "public": true, "service_key": "developers-console", "service_name": "Google Cloud Console", "severity": "high", "updates": [{"created": "2018-03-14T02:23:23Z", "modified": "2018-03-14T02:23:23Z", "text": "The issue with Cloud Shell has been resolved for all affected users as of Tue 2018-03-13 19:00 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-03-14T02:23:23Z"}, {"created": "2018-03-14T01:30:58Z", "modified": "2018-03-14T01:30:58Z", "text": "We are still investigating an issue with Google Cloud Shell starting 2018-03-13 17:44 US/Pacific. We will provide more information by 19:30 US/Pacific.", "when": "2018-03-14T01:30:58Z"}, {"created": "2018-03-14T01:02:55Z", "modified": "2018-03-14T01:02:55Z", "text": "We are investigating an issue with Google Cloud Shell as of 2018-03-13 17:44 US/Pacific. We will provide more information by 18:30 US/Pacific.", "when": "2018-03-14T01:02:55Z"}, {"created": "2018-03-14T01:02:48Z", "modified": "2018-03-14T01:02:48Z", "text": "", "when": "2018-03-14T01:02:48Z"}], "uri": "/incident/developers-console/18001"}, {"begin": "2018-03-11T05:15:00Z", "created": "2018-03-11T05:15:02Z", "end": "2018-03-11T05:57:45Z", "external_desc": "The issue with Google Compute Engine Nested Virtualization's (beta) creation of new instances failing to persist vmx license has been mitigated for all affected projects as of 21:48 US/Pacific. We will make appropriate improvements to our systems to help prevent or minimize future recurrence.", "modified": "2018-03-11T05:57:49Z", "most-recent-update": {"created": "2018-03-11T05:57:48Z", "modified": "2018-03-11T05:57:48Z", "text": "The issue with Google Compute Engine Nested Virtualization's (beta) creation of new instances failing to persist vmx license has been mitigated for all affected projects as of 21:48 US/Pacific. 
We will make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-03-11T05:57:48Z"}, "number": 18002, "public": true, "service_key": "compute", "service_name": "Google Compute Engine", "severity": "medium", "updates": [{"created": "2018-03-11T05:57:48Z", "modified": "2018-03-11T05:57:48Z", "text": "The issue with Google Compute Engine Nested Virtualization's (beta) creation of new instances failing to persist vmx license has been mitigated for all affected projects as of 21:48 US/Pacific. We will make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-03-11T05:57:48Z"}, {"created": "2018-03-11T05:15:10Z", "modified": "2018-03-11T05:15:10Z", "text": "We've received a report of an issue with Google Compute Engine Nested virtualization as of 2018-03-10 21:06 US/Pacific. Mitigation work is currently underway by our Engineering Team. We will provide more information by 22:00 US/Pacific.", "when": "2018-03-11T05:15:10Z"}, {"created": "2018-03-11T05:15:05Z", "modified": "2018-03-11T05:15:05Z", "text": "", "when": "2018-03-11T05:15:05Z"}], "uri": "/incident/compute/18002"}, {"begin": "2018-02-24T06:19:40Z", "created": "2018-02-24T06:19:42Z", "end": "2018-02-24T06:36:34Z", "external_desc": "The issue with Google Cloud Networking intermittent traffic disruption to and from us-central has been resolved for all affected users as of 2018-02-23 22:35 US/Pacific.", "modified": "2018-02-24T06:36:38Z", "most-recent-update": {"created": "2018-02-24T06:36:38Z", "modified": "2018-02-24T06:36:38Z", "text": "The issue with Google Cloud Networking intermittent traffic disruption to and from us-central has been resolved for all affected users as of 2018-02-23 22:35 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-24T06:36:38Z"}, "number": 18006, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "high", "updates": [{"created": "2018-02-24T06:36:38Z", "modified": "2018-02-24T06:36:38Z", "text": "The issue with Google Cloud Networking intermittent traffic disruption to and from us-central has been resolved for all affected users as of 2018-02-23 22:35 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-24T06:36:38Z"}, {"created": "2018-02-24T06:30:29Z", "modified": "2018-02-24T06:30:29Z", "text": "The issue with Google Cloud Networking intermittent traffic disruption to and from us-central. should now show sign of recovery for the\r\nmajority of users and we expect a full resolution in the near future.\r\n\r\nWe will provide another status update by Friday, 2018-02-23 22:40 US/Pacific. US/Pacific with current details.", "when": "2018-02-24T06:30:29Z"}, {"created": "2018-02-24T06:19:47Z", "modified": "2018-02-24T06:19:47Z", "text": "We are investigating an issue with Google Cloud Networking. The issue started around Friday, 2018-02-23 21:40 US/Pacific. This affects traffic to and from us-central. We are rolling out a potential fix to mitigate this issue. We will provide more information by Friday, 2018-02-23 22:40 US/Pacific.", "when": "2018-02-24T06:19:47Z"}, {"created": "2018-02-24T06:19:43Z", "modified": "2018-02-24T06:19:43Z", "text": "", "when": "2018-02-24T06:19:43Z"}], "uri": "/incident/cloud-networking/18006"}, {"begin": "2018-02-24T05:53:40Z", "created": "2018-02-24T05:53:42Z", "end": "2018-02-24T06:42:05Z", "external_desc": "We are investigating an issue with Google Cloud Networking. 
We will provide more information by 22:30 US/Pacific.", "modified": "2018-02-24T06:42:05Z", "most-recent-update": {"created": "2018-02-24T06:42:05Z", "modified": "2018-02-24T06:42:05Z", "text": "The issue with Google Cloud Networking intermittent traffic disruption to and from us-central has been resolved for all affected users as of 2018-02-23 22:35 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.\r\n\r\n", "when": "2018-02-24T06:42:05Z"}, "number": 18005, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "medium", "updates": [{"created": "2018-02-24T06:42:05Z", "modified": "2018-02-24T06:42:05Z", "text": "The issue with Google Cloud Networking intermittent traffic disruption to and from us-central has been resolved for all affected users as of 2018-02-23 22:35 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.\r\n\r\n", "when": "2018-02-24T06:42:05Z"}, {"created": "2018-02-24T06:01:49Z", "modified": "2018-02-24T06:01:49Z", "text": "We are rolling out a potential fix to mitigate this issue. The affected region seems to be us-central.", "when": "2018-02-24T06:01:49Z"}, {"created": "2018-02-24T05:53:44Z", "modified": "2018-02-24T05:53:44Z", "text": "We are investigating an issue with Google Cloud Networking. We will provide more information by 22:30 US/Pacific.", "when": "2018-02-24T05:53:44Z"}], "uri": "/incident/cloud-networking/18005"}, {"begin": "2018-02-20T11:02:42Z", "created": "2018-02-20T11:02:45Z", "end": "2018-02-20T13:30:22Z", "external_desc": "Cloud PubSub experiencing missing subscription metrics. 
Additionally, some Dataflow jobs with PubSub sources appear as if they do not consume any messages.", "modified": "2018-02-20T13:30:27Z", "most-recent-update": {"created": "2018-02-20T13:30:27Z", "modified": "2018-02-20T13:30:27Z", "text": "The issue with Cloud PubSub causing watermark increase in Dataflow jobs has been resolved for all affected projects as of Tue, 2018-02-20 05:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-20T13:30:27Z"}, "number": 18001, "public": true, "service_key": "cloud-pubsub", "service_name": "Google Cloud Pub/Sub", "severity": "medium", "updates": [{"created": "2018-02-20T13:30:27Z", "modified": "2018-02-20T13:30:27Z", "text": "The issue with Cloud PubSub causing watermark increase in Dataflow jobs has been resolved for all affected projects as of Tue, 2018-02-20 05:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-20T13:30:27Z"}, {"created": "2018-02-20T13:03:57Z", "modified": "2018-02-20T13:03:57Z", "text": "The watermarks of the affected Dataflow jobs using PubSub are now returning to normal.", "when": "2018-02-20T13:03:57Z"}, {"created": "2018-02-20T12:07:57Z", "modified": "2018-02-20T12:07:57Z", "text": "We are experiencing an issue with Cloud PubSub beginning approximately at 20:00 2018-02-19 US/Pacific. Early investigation indicates that approximately 10-15% of Dataflow jobs are affected by this issue. For everyone who is affected, we apologize for the disruption. We will provide an update by 05:30 US/Pacific with current details.", "when": "2018-02-20T12:07:57Z"}, {"created": "2018-02-20T11:02:52Z", "modified": "2018-02-20T11:02:52Z", "text": "We are investigating an issue with Cloud PubSub. 
We will provide more information by 04:00 AM US/Pacific.", "when": "2018-02-20T11:02:52Z"}, {"created": "2018-02-20T11:02:47Z", "modified": "2018-02-20T11:02:47Z", "text": "", "when": "2018-02-20T11:02:47Z"}], "uri": "/incident/cloud-pubsub/18001"}, {"begin": "2018-02-15T21:22:53Z", "created": "2018-02-15T21:22:54Z", "end": "2018-02-15T21:22:58Z", "external_desc": "We are investigating an issue with Google Stackdriver. We will provide more information by 13:15 US/Pacific.", "modified": "2018-02-15T21:23:02Z", "most-recent-update": {"created": "2018-02-15T21:23:02Z", "modified": "2018-02-15T21:23:02Z", "text": "The issue with Google Stackdriver has been resolved for all affected projects as of 13:00 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-15T21:23:02Z"}, "number": 18004, "public": true, "service_key": "google-stackdriver", "service_name": "Google Stackdriver", "severity": "medium", "updates": [{"created": "2018-02-15T21:23:02Z", "modified": "2018-02-15T21:23:02Z", "text": "The issue with Google Stackdriver has been resolved for all affected projects as of 13:00 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-15T21:23:02Z"}, {"created": "2018-02-15T21:22:57Z", "modified": "2018-02-15T21:22:57Z", "text": "", "when": "2018-02-15T21:22:57Z"}], "uri": "/incident/google-stackdriver/18004"}, {"begin": "2018-02-15T20:49:00Z", "created": "2018-02-15T20:49:45Z", "end": "2018-02-15T21:37:06Z", "external_desc": "Stackdriver Logging Service Degraded", "modified": "2018-02-15T21:37:07Z", "most-recent-update": {"created": "2018-02-15T21:37:06Z", "modified": "2018-02-15T21:37:06Z", "text": "\t\r\nThe issue with Google Stackdriver has been resolved for all affected projects as of 13:00 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-15T21:37:06Z"}, "number": 18003, "public": true, "service_key": "google-stackdriver", "service_name": "Google Stackdriver", "severity": "medium", "updates": [{"created": "2018-02-15T21:37:06Z", "modified": "2018-02-15T21:37:06Z", "text": "\t\r\nThe issue with Google Stackdriver has been resolved for all affected projects as of 13:00 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-15T21:37:06Z"}, {"created": "2018-02-15T20:49:45Z", "modified": "2018-02-15T20:49:45Z", "text": "We are investigating an issue with Stackdriver Logging Service. We will provide more information by 13:25 US/Pacific.", "when": "2018-02-15T20:49:45Z"}], "uri": "/incident/google-stackdriver/18003"}, {"begin": "2018-02-15T19:42:00Z", "created": "2018-02-15T20:41:55Z", "end": "2018-02-15T20:44:00Z", "external_desc": "App Engine seeing elevated error rates", "modified": "2018-02-22T15:13:27Z", "most-recent-update": {"created": "2018-02-22T15:13:27Z", "modified": "2018-02-22T15:13:27Z", "text": "On Thursday 15 February 2018, specific Google Cloud Platform services experienced elevated errors and latency for a period of 62 minutes from 11:42 to 12:44 PST. The following services were impacted:\r\n\r\nCloud Datastore experienced a 4% error rate for get calls and an 88% error rate for put calls.\r\n\r\nApp Engine's serving infrastructure, which is responsible for routing requests to instances, experienced a 45% error rate, most of which were timeouts.\r\n\r\nApp Engine Task Queues would not accept new transactional tasks, and also would not accept new tasks in regions outside us-central1 and europe-west1. 
Tasks continued to be dispatched during the event but saw start delays of 0-30 minutes; additionally, a fraction of tasks executed with errors due to the aforementioned Cloud Datastore and App Engine performance issues.\r\n\r\nApp Engine Memcache calls experienced a 5% error rate.\r\n\r\nApp Engine Admin API write calls failed during the incident, causing unsuccessful application deployments. App Engine Admin API read calls experienced a 13% error rate.\r\n\r\nApp Engine Search API index writes failed during the incident though search queries did not experience elevated errors.\r\n\r\nStackdriver Logging experienced delays exporting logs to systems including Cloud Console Logs Viewer, BigQuery and Cloud Pub/Sub. Stackdriver Logging retries on failure so no logs were lost during the incident. Logs-based Metrics failed to post some points during the incident.\r\n\r\nWe apologize for the impact of this outage on your application or service. For Google Cloud Platform customers who rely on the products which were part of this event, the impact was substantial and we recognize that it caused significant disruption for those customers. We are conducting a detailed post-mortem to ensure that all the root and contributing causes of this event are understood and addressed promptly. \r\n", "when": "2018-02-22T15:13:27Z"}, "number": 18003, "public": true, "service_key": "appengine", "service_name": "Google App Engine", "severity": "high", "updates": [{"created": "2018-02-22T15:13:27Z", "modified": "2018-02-22T15:13:27Z", "text": "On Thursday 15 February 2018, specific Google Cloud Platform services experienced elevated errors and latency for a period of 62 minutes from 11:42 to 12:44 PST. 
The following services were impacted:\r\n\r\nCloud Datastore experienced a 4% error rate for get calls and an 88% error rate for put calls.\r\n\r\nApp Engine's serving infrastructure, which is responsible for routing requests to instances, experienced a 45% error rate, most of which were timeouts.\r\n\r\nApp Engine Task Queues would not accept new transactional tasks, and also would not accept new tasks in regions outside us-central1 and europe-west1. Tasks continued to be dispatched during the event but saw start delays of 0-30 minutes; additionally, a fraction of tasks executed with errors due to the aforementioned Cloud Datastore and App Engine performance issues.\r\n\r\nApp Engine Memcache calls experienced a 5% error rate.\r\n\r\nApp Engine Admin API write calls failed during the incident, causing unsuccessful application deployments. App Engine Admin API read calls experienced a 13% error rate.\r\n\r\nApp Engine Search API index writes failed during the incident though search queries did not experience elevated errors.\r\n\r\nStackdriver Logging experienced delays exporting logs to systems including Cloud Console Logs Viewer, BigQuery and Cloud Pub/Sub. Stackdriver Logging retries on failure so no logs were lost during the incident. Logs-based Metrics failed to post some points during the incident.\r\n\r\nWe apologize for the impact of this outage on your application or service. For Google Cloud Platform customers who rely on the products which were part of this event, the impact was substantial and we recognize that it caused significant disruption for those customers. We are conducting a detailed post-mortem to ensure that all the root and contributing causes of this event are understood and addressed promptly. \r\n", "when": "2018-02-22T15:13:27Z"}, {"created": "2018-02-15T22:04:36Z", "modified": "2018-02-15T23:08:49Z", "text": "The issue with App Engine has been resolved for all affected\r\nprojects as of 12:44 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.\r\n\r\nWe will provide a more detailed analysis of this incident once we have completed\r\nour internal investigation.", "when": "2018-02-15T22:03:00Z"}, {"created": "2018-02-15T21:04:29Z", "modified": "2018-02-15T21:04:29Z", "text": "We're seeing widespread improvement in error rates in many / most regions since ~12:40 PST. We're continuing to investigate and will provide another update by 13:30 PST.", "when": "2018-02-15T21:04:29Z"}, {"created": "2018-02-15T20:41:55Z", "modified": "2018-02-15T20:41:55Z", "text": "We are investigating an issue with App Engine. We will provide more information by 13:00 US/Pacific.", "when": "2018-02-15T20:41:55Z"}], "uri": "/incident/appengine/18003"}, {"begin": "2018-02-15T19:42:00Z", "created": "2018-02-15T20:43:41Z", "end": "2018-02-15T20:44:57Z", "external_desc": "Bigquery experiencing high latency rates", "modified": "2018-02-22T15:15:19Z", "most-recent-update": {"created": "2018-02-22T15:15:19Z", "modified": "2018-02-22T15:15:19Z", "text": "On Thursday 15 February 2018, specific Google Cloud Platform services experienced elevated errors and latency for a period of 62 minutes from 11:42 to 12:44 PST. The following services were impacted:\r\n\r\nCloud Datastore experienced a 4% error rate for get calls and an 88% error rate for put calls.\r\n\r\nApp Engine's serving infrastructure, which is responsible for routing requests to instances, experienced a 45% error rate, most of which were timeouts.\r\n\r\nApp Engine Task Queues would not accept new transactional tasks, and also would not accept new tasks in regions outside us-central1 and europe-west1. 
Tasks continued to be dispatched during the event but saw start delays of 0-30 minutes; additionally, a fraction of tasks executed with errors due to the aforementioned Cloud Datastore and App Engine performance issues.\r\n\r\nApp Engine Memcache calls experienced a 5% error rate.\r\n\r\nApp Engine Admin API write calls failed during the incident, causing unsuccessful application deployments. App Engine Admin API read calls experienced a 13% error rate.\r\n\r\nApp Engine Search API index writes failed during the incident though search queries did not experience elevated errors.\r\n\r\nStackdriver Logging experienced delays exporting logs to systems including Cloud Console Logs Viewer, BigQuery and Cloud Pub/Sub. Stackdriver Logging retries on failure so no logs were lost during the incident. Logs-based Metrics failed to post some points during the incident.\r\n\r\nWe apologize for the impact of this outage on your application or service. For Google Cloud Platform customers who rely on the products which were part of this event, the impact was substantial and we recognize that it caused significant disruption for those customers. We are conducting a detailed post-mortem to ensure that all the root and contributing causes of this event are understood and addressed promptly.", "when": "2018-02-22T15:15:19Z"}, "number": 18035, "public": true, "service_key": "bigquery", "service_name": "Google BigQuery", "severity": "medium", "updates": [{"created": "2018-02-22T15:15:19Z", "modified": "2018-02-22T15:15:19Z", "text": "On Thursday 15 February 2018, specific Google Cloud Platform services experienced elevated errors and latency for a period of 62 minutes from 11:42 to 12:44 PST. 
The following services were impacted:\r\n\r\nCloud Datastore experienced a 4% error rate for get calls and an 88% error rate for put calls.\r\n\r\nApp Engine's serving infrastructure, which is responsible for routing requests to instances, experienced a 45% error rate, most of which were timeouts.\r\n\r\nApp Engine Task Queues would not accept new transactional tasks, and also would not accept new tasks in regions outside us-central1 and europe-west1. Tasks continued to be dispatched during the event but saw start delays of 0-30 minutes; additionally, a fraction of tasks executed with errors due to the aforementioned Cloud Datastore and App Engine performance issues.\r\n\r\nApp Engine Memcache calls experienced a 5% error rate.\r\n\r\nApp Engine Admin API write calls failed during the incident, causing unsuccessful application deployments. App Engine Admin API read calls experienced a 13% error rate.\r\n\r\nApp Engine Search API index writes failed during the incident though search queries did not experience elevated errors.\r\n\r\nStackdriver Logging experienced delays exporting logs to systems including Cloud Console Logs Viewer, BigQuery and Cloud Pub/Sub. Stackdriver Logging retries on failure so no logs were lost during the incident. Logs-based Metrics failed to post some points during the incident.\r\n\r\nWe apologize for the impact of this outage on your application or service. For Google Cloud Platform customers who rely on the products which were part of this event, the impact was substantial and we recognize that it caused significant disruption for those customers. We are conducting a detailed post-mortem to ensure that all the root and contributing causes of this event are understood and addressed promptly.", "when": "2018-02-22T15:15:19Z"}, {"created": "2018-02-15T21:09:57Z", "modified": "2018-02-15T21:09:57Z", "text": "The issue with Bigquery has been resolved for all affected projects as of Thursday, 2018-02-15 13:00 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-15T21:09:57Z"}, {"created": "2018-02-15T20:43:41Z", "modified": "2018-02-15T20:43:41Z", "text": "We are investigating an issue with Bigquery. We will provide more information by 13:00 US/Pacific.", "when": "2018-02-15T20:43:41Z"}], "uri": "/incident/bigquery/18035"}, {"begin": "2018-02-15T19:42:00Z", "created": "2018-02-15T20:39:05Z", "end": "2018-02-15T20:44:25Z", "external_desc": "Datastore experiencing elevated error rates", "modified": "2018-02-22T15:16:16Z", "most-recent-update": {"created": "2018-02-22T15:16:15Z", "modified": "2018-02-22T15:16:15Z", "text": "On Thursday 15 February 2018, specific Google Cloud Platform services experienced elevated errors and latency for a period of 62 minutes from 11:42 to 12:44 PST. The following services were impacted:\r\n\r\nCloud Datastore experienced a 4% error rate for get calls and an 88% error rate for put calls.\r\n\r\nApp Engine's serving infrastructure, which is responsible for routing requests to instances, experienced a 45% error rate, most of which were timeouts.\r\n\r\nApp Engine Task Queues would not accept new transactional tasks, and also would not accept new tasks in regions outside us-central1 and europe-west1. Tasks continued to be dispatched during the event but saw start delays of 0-30 minutes; additionally, a fraction of tasks executed with errors due to the aforementioned Cloud Datastore and App Engine performance issues.\r\n\r\nApp Engine Memcache calls experienced a 5% error rate.\r\n\r\nApp Engine Admin API write calls failed during the incident, causing unsuccessful application deployments. 
App Engine Admin API read calls experienced a 13% error rate.\r\n\r\nApp Engine Search API index writes failed during the incident though search queries did not experience elevated errors.\r\n\r\nStackdriver Logging experienced delays exporting logs to systems including Cloud Console Logs Viewer, BigQuery and Cloud Pub/Sub. Stackdriver Logging retries on failure so no logs were lost during the incident. Logs-based Metrics failed to post some points during the incident.\r\n\r\nWe apologize for the impact of this outage on your application or service. For Google Cloud Platform customers who rely on the products which were part of this event, the impact was substantial and we recognize that it caused significant disruption for those customers. We are conducting a detailed post-mortem to ensure that all the root and contributing causes of this event are understood and addressed promptly.", "when": "2018-02-22T15:16:15Z"}, "number": 18001, "public": true, "service_key": "cloud-datastore", "service_name": "Google Cloud Datastore", "severity": "medium", "updates": [{"created": "2018-02-22T15:16:15Z", "modified": "2018-02-22T15:16:15Z", "text": "On Thursday 15 February 2018, specific Google Cloud Platform services experienced elevated errors and latency for a period of 62 minutes from 11:42 to 12:44 PST. The following services were impacted:\r\n\r\nCloud Datastore experienced a 4% error rate for get calls and an 88% error rate for put calls.\r\n\r\nApp Engine's serving infrastructure, which is responsible for routing requests to instances, experienced a 45% error rate, most of which were timeouts.\r\n\r\nApp Engine Task Queues would not accept new transactional tasks, and also would not accept new tasks in regions outside us-central1 and europe-west1. 
Tasks continued to be dispatched during the event but saw start delays of 0-30 minutes; additionally, a fraction of tasks executed with errors due to the aforementioned Cloud Datastore and App Engine performance issues.\r\n\r\nApp Engine Memcache calls experienced a 5% error rate.\r\n\r\nApp Engine Admin API write calls failed during the incident, causing unsuccessful application deployments. App Engine Admin API read calls experienced a 13% error rate.\r\n\r\nApp Engine Search API index writes failed during the incident though search queries did not experience elevated errors.\r\n\r\nStackdriver Logging experienced delays exporting logs to systems including Cloud Console Logs Viewer, BigQuery and Cloud Pub/Sub. Stackdriver Logging retries on failure so no logs were lost during the incident. Logs-based Metrics failed to post some points during the incident.\r\n\r\nWe apologize for the impact of this outage on your application or service. For Google Cloud Platform customers who rely on the products which were part of this event, the impact was substantial and we recognize that it caused significant disruption for those customers. We are conducting a detailed post-mortem to ensure that all the root and contributing causes of this event are understood and addressed promptly.", "when": "2018-02-22T15:16:15Z"}, {"created": "2018-02-15T22:02:25Z", "modified": "2018-02-15T22:02:25Z", "text": "The issue with Datastore has been resolved for all affected projects as of Thursday, 2018-02-15 13:45 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. ", "when": "2018-02-15T22:02:25Z"}, {"created": "2018-02-15T21:52:16Z", "modified": "2018-02-15T21:52:16Z", "text": "We continue to see significant improvement to all datastore services. 
We are still continuing to monitor and will provide another update by 15:00 PST.", "when": "2018-02-15T21:52:16Z"}, {"created": "2018-02-15T21:14:28Z", "modified": "2018-02-15T21:14:28Z", "text": "We are seeing near return to baseline, however we aren't seeing a consistent view of our quota and are investigating. We will provide another update by roughly 13:45 PST.", "when": "2018-02-15T21:14:28Z"}, {"created": "2018-02-15T20:39:05Z", "modified": "2018-02-15T20:39:05Z", "text": "We are investigating an issue with Datastore. We will provide more information by 13:00 US/Pacific.", "when": "2018-02-15T20:39:05Z"}], "uri": "/incident/cloud-datastore/18001"}, {"begin": "2018-02-15T19:42:00Z", "created": "2018-02-15T20:59:31Z", "end": "2018-02-15T20:44:23Z", "external_desc": "App Engine Admin-API experiencing high error rates", "modified": "2018-02-22T15:17:13Z", "most-recent-update": {"created": "2018-02-22T15:17:12Z", "modified": "2018-02-22T15:17:12Z", "text": "On Thursday 15 February 2018, specific Google Cloud Platform services experienced elevated errors and latency for a period of 62 minutes from 11:42 to 12:44 PST. The following services were impacted:\r\n\r\nCloud Datastore experienced a 4% error rate for get calls and an 88% error rate for put calls.\r\n\r\nApp Engine's serving infrastructure, which is responsible for routing requests to instances, experienced a 45% error rate, most of which were timeouts.\r\n\r\nApp Engine Task Queues would not accept new transactional tasks, and also would not accept new tasks in regions outside us-central1 and europe-west1. Tasks continued to be dispatched during the event but saw start delays of 0-30 minutes; additionally, a fraction of tasks executed with errors due to the aforementioned Cloud Datastore and App Engine performance issues.\r\n\r\nApp Engine Memcache calls experienced a 5% error rate.\r\n\r\nApp Engine Admin API write calls failed during the incident, causing unsuccessful application deployments. 
App Engine Admin API read calls experienced a 13% error rate.\r\n\r\nApp Engine Search API index writes failed during the incident though search queries did not experience elevated errors.\r\n\r\nStackdriver Logging experienced delays exporting logs to systems including Cloud Console Logs Viewer, BigQuery and Cloud Pub/Sub. Stackdriver Logging retries on failure so no logs were lost during the incident. Logs-based Metrics failed to post some points during the incident.\r\n\r\nWe apologize for the impact of this outage on your application or service. For Google Cloud Platform customers who rely on the products which were part of this event, the impact was substantial and we recognize that it caused significant disruption for those customers. We are conducting a detailed post-mortem to ensure that all the root and contributing causes of this event are understood and addressed promptly.", "when": "2018-02-22T15:17:12Z"}, "number": 18001, "public": true, "service_key": "cloud-dev-tools", "service_name": "Cloud Developer Tools", "severity": "medium", "updates": [{"created": "2018-02-22T15:17:12Z", "modified": "2018-02-22T15:17:12Z", "text": "On Thursday 15 February 2018, specific Google Cloud Platform services experienced elevated errors and latency for a period of 62 minutes from 11:42 to 12:44 PST. The following services were impacted:\r\n\r\nCloud Datastore experienced a 4% error rate for get calls and an 88% error rate for put calls.\r\n\r\nApp Engine's serving infrastructure, which is responsible for routing requests to instances, experienced a 45% error rate, most of which were timeouts.\r\n\r\nApp Engine Task Queues would not accept new transactional tasks, and also would not accept new tasks in regions outside us-central1 and europe-west1. 
Tasks continued to be dispatched during the event but saw start delays of 0-30 minutes; additionally, a fraction of tasks executed with errors due to the aforementioned Cloud Datastore and App Engine performance issues.\r\n\r\nApp Engine Memcache calls experienced a 5% error rate.\r\n\r\nApp Engine Admin API write calls failed during the incident, causing unsuccessful application deployments. App Engine Admin API read calls experienced a 13% error rate.\r\n\r\nApp Engine Search API index writes failed during the incident though search queries did not experience elevated errors.\r\n\r\nStackdriver Logging experienced delays exporting logs to systems including Cloud Console Logs Viewer, BigQuery and Cloud Pub/Sub. Stackdriver Logging retries on failure so no logs were lost during the incident. Logs-based Metrics failed to post some points during the incident.\r\n\r\nWe apologize for the impact of this outage on your application or service. For Google Cloud Platform customers who rely on the products which were part of this event, the impact was substantial and we recognize that it caused significant disruption for those customers. We are conducting a detailed post-mortem to ensure that all the root and contributing causes of this event are understood and addressed promptly.", "when": "2018-02-22T15:17:12Z"}, {"created": "2018-02-15T21:42:23Z", "modified": "2018-02-15T21:42:23Z", "text": "The issue with App Engine Admin API has been resolved for all affected users as of Thursday, 2018-02-15 13:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-15T21:42:23Z"}, {"created": "2018-02-15T20:59:31Z", "modified": "2018-02-15T20:59:31Z", "text": "We are investigating an issue with App Engine Admin API. 
We will provide more information by 13:30 US/Pacific.", "when": "2018-02-15T20:59:31Z"}], "uri": "/incident/cloud-dev-tools/18001"}, {"begin": "2018-02-11T02:32:35Z", "created": "2018-02-11T02:32:36Z", "end": "2018-02-11T03:27:39Z", "external_desc": "The issue with reduced URLFetch availability has been resolved for all affected projects as of 2018-02-10 18:55 US/Pacific. We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "modified": "2018-02-11T03:27:40Z", "most-recent-update": {"created": "2018-02-11T03:27:39Z", "modified": "2018-02-11T03:27:39Z", "text": "The issue with reduced URLFetch availability has been resolved for all affected projects as of 2018-02-10 18:55 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-02-11T03:27:39Z"}, "number": 18002, "public": true, "service_key": "appengine", "service_name": "Google App Engine", "severity": "medium", "updates": [{"created": "2018-02-11T03:27:39Z", "modified": "2018-02-11T03:27:39Z", "text": "The issue with reduced URLFetch availability has been resolved for all affected projects as of 2018-02-10 18:55 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-02-11T03:27:39Z"}, {"created": "2018-02-11T02:58:37Z", "modified": "2018-02-11T02:58:37Z", "text": "We are investigating an issue with Google App Engine reduced URLFetch availability starting  at 17:15pm US/Pacific. We are currently rolling out a configuration change to mitigate this issue. 
We will provide another status update by 2018-02-10 19:30 US/Pacific with current details.", "when": "2018-02-11T02:58:37Z"}, {"created": "2018-02-11T02:32:37Z", "modified": "2018-02-11T02:32:37Z", "text": "We are investigating an issue with Google App Engine reduced URLFetch availability starting at 17:15pm PT. We will provide more information by 19:00 US/Pacific.", "when": "2018-02-11T02:32:37Z"}], "uri": "/incident/appengine/18002"}, {"begin": "2018-02-06T19:09:14Z", "created": "2018-02-06T19:09:15Z", "end": "2018-02-06T19:49:52Z", "external_desc": "We are investigating an issue with Google Container Engine that is affecting cluster creation and upgrade. We will provide more information by 11:45 US/Pacific.", "modified": "2018-02-06T19:49:52Z", "most-recent-update": {"created": "2018-02-06T19:49:51Z", "modified": "2018-02-06T19:49:51Z", "text": "The issue with cluster creation and upgrade has been resolved for all affected projects as of 11:35 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-06T19:49:51Z"}, "number": 18003, "public": true, "service_key": "container-engine", "service_name": "Google Kubernetes Engine", "severity": "medium", "updates": [{"created": "2018-02-06T19:49:51Z", "modified": "2018-02-06T19:49:51Z", "text": "The issue with cluster creation and upgrade has been resolved for all affected projects as of 11:35 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-06T19:49:51Z"}, {"created": "2018-02-06T19:09:16Z", "modified": "2018-02-06T19:09:16Z", "text": "We are investigating an issue with Google Container Engine that is affecting cluster creation and upgrade. 
We will provide more information by 11:45 US/Pacific.", "when": "2018-02-06T19:09:16Z"}], "uri": "/incident/container-engine/18003"}, {"begin": "2018-02-02T16:00:33Z", "created": "2018-02-02T16:00:34Z", "end": "2018-02-02T18:18:08Z", "external_desc": "We are investigating an issue with Google Stackdriver Trace API.", "modified": "2018-02-02T18:18:08Z", "most-recent-update": {"created": "2018-02-02T18:18:07Z", "modified": "2018-02-02T18:18:07Z", "text": "The issue with elevated failure rates in the Stackdriver Trace API has been resolved for all affected projects as of Friday, 2018-02-02 09:53 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-02T18:18:07Z"}, "number": 18002, "public": true, "service_key": "google-stackdriver", "service_name": "Google Stackdriver", "severity": "high", "updates": [{"created": "2018-02-02T18:18:07Z", "modified": "2018-02-02T18:18:07Z", "text": "The issue with elevated failure rates in the Stackdriver Trace API has been resolved for all affected projects as of Friday, 2018-02-02 09:53 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-02T18:18:07Z"}, {"created": "2018-02-02T17:38:17Z", "modified": "2018-02-02T17:38:17Z", "text": "Stackdriver Trace API continues to exhibit an elevated rate of request failures. Our engineering teams have put a mitigation in place, and will proceed to address the cause of the issue. We will provide another update by 10:45 AM US/Pacific.", "when": "2018-02-02T17:38:17Z"}, {"created": "2018-02-02T16:58:20Z", "modified": "2018-02-02T16:58:20Z", "text": "StackDriver Trace API is experiencing an elevated rate of request failures. There is no workaround at this time. 
Our engineering teams have identified the cause of the issue and are working on a mitigation. We will provide another update by 9:30 US/Pacific.", "when": "2018-02-02T16:58:20Z"}, {"created": "2018-02-02T16:00:35Z", "modified": "2018-02-02T16:00:35Z", "text": "We are investigating an issue with Google Stackdriver Trace API. We will provide more information by 09:00 US/Pacific.", "when": "2018-02-02T16:00:35Z"}], "uri": "/incident/google-stackdriver/18002"}, {"begin": "2018-02-01T02:20:48Z", "created": "2018-02-01T04:50:48Z", "end": "2018-02-01T03:50:50Z", "external_desc": "We experienced an issue with Google Cloud Storage, beginning at approximately 18:30 US/Pacific. The situation has been completely resolved by 20:08 PST.", "modified": "2018-02-07T18:06:02Z", "most-recent-update": {"created": "2018-02-07T18:06:01Z", "modified": "2018-02-07T18:06:01Z", "text": "On Wednesday 31 January 2018, some Google Cloud services experienced elevated errors and latency on operations that required inter-data center network traffic for a duration of 72 minutes. The impact was visible during three windows: between 18:20 and 19:08 PST, between 19:10 and 19:29, and again between 19:45 and 19:50. Network traffic between the public internet and Google's data centers was not affected by this incident.\r\n\r\nThe root cause of this incident was an error in a configuration update to the system that allocates network capacity for traffic between Google data centers. \r\n\r\nTo prevent a recurrence, we will improve the automated checks that we run on configuration changes to detect problems before release. 
We will be improving the monitoring of the canary to detect problems before global rollout of changes to the configuration.\r\n", "when": "2018-02-07T18:06:01Z"}, "number": 18002, "public": true, "service_key": "storage", "service_name": "Google Cloud Storage", "severity": "medium", "updates": [{"created": "2018-02-07T18:06:01Z", "modified": "2018-02-07T18:06:01Z", "text": "On Wednesday 31 January 2018, some Google Cloud services experienced elevated errors and latency on operations that required inter-data center network traffic for a duration of 72 minutes. The impact was visible during three windows: between 18:20 and 19:08 PST, between 19:10 and 19:29, and again between 19:45 and 19:50. Network traffic between the public internet and Google's data centers was not affected by this incident.\r\n\r\nThe root cause of this incident was an error in a configuration update to the system that allocates network capacity for traffic between Google data centers. \r\n\r\nTo prevent a recurrence, we will improve the automated checks that we run on configuration changes to detect problems before release. We will be improving the monitoring of the canary to detect problems before global rollout of changes to the configuration.\r\n", "when": "2018-02-07T18:06:01Z"}, {"created": "2018-02-01T04:50:50Z", "modified": "2018-02-01T04:50:50Z", "text": "We experienced an issue with Google Cloud Storage, beginning at approximately 18:30 US/Pacific. 
The situation has been completely resolved by 20:08 PST.", "when": "2018-02-01T04:50:50Z"}, {"created": "2018-02-01T04:50:49Z", "modified": "2018-02-01T04:50:49Z", "text": "", "when": "2018-02-01T04:50:49Z"}], "uri": "/incident/storage/18002"}, {"begin": "2018-02-01T02:20:47Z", "created": "2018-02-01T03:52:47Z", "end": "2018-02-01T03:50:30Z", "external_desc": "The issue with Google Compute Engine has been resolved for all affected projects as of 20:30 US/Pacific.", "modified": "2018-02-07T18:05:27Z", "most-recent-update": {"created": "2018-02-07T18:05:27Z", "modified": "2018-02-07T18:05:27Z", "text": "On Wednesday 31 January 2018, some Google Cloud services experienced elevated errors and latency on operations that required inter-data center network traffic for a duration of 72 minutes. The impact was visible during three windows: between 18:20 and 19:08 PST, between 19:10 and 19:29, and again between 19:45 and 19:50. Network traffic between the public internet and Google's data centers was not affected by this incident.\r\n\r\nThe root cause of this incident was an error in a configuration update to the system that allocates network capacity for traffic between Google data centers. \r\n\r\nTo prevent a recurrence, we will improve the automated checks that we run on configuration changes to detect problems before release. We will be improving the monitoring of the canary to detect problems before global rollout of changes to the configuration.\r\n", "when": "2018-02-07T18:05:27Z"}, "number": 18001, "public": true, "service_key": "compute", "service_name": "Google Compute Engine", "severity": "medium", "updates": [{"created": "2018-02-07T18:05:27Z", "modified": "2018-02-07T18:05:27Z", "text": "On Wednesday 31 January 2018, some Google Cloud services experienced elevated errors and latency on operations that required inter-data center network traffic for a duration of 72 minutes. 
The impact was visible during three windows: between 18:20 and 19:08 PST, between 19:10 and 19:29, and again between 19:45 and 19:50. Network traffic between the public internet and Google's data centers was not affected by this incident.\r\n\r\nThe root cause of this incident was an error in a configuration update to the system that allocates network capacity for traffic between Google data centers. \r\n\r\nTo prevent a recurrence, we will improve the automated checks that we run on configuration changes to detect problems before release. We will be improving the monitoring of the canary to detect problems before global rollout of changes to the configuration.\r\n", "when": "2018-02-07T18:05:27Z"}, {"created": "2018-02-01T04:53:29Z", "modified": "2018-02-01T04:53:29Z", "text": "The issue with Google Compute Engine has been resolved for all affected projects as of 20:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.\r\nFor everyone who is affected, we apologize for the disruption.", "when": "2018-02-01T04:53:29Z"}, {"created": "2018-02-01T04:23:56Z", "modified": "2018-02-01T04:23:56Z", "text": "The issue with Google Compute Engine should be resolved for the majority of projects and we expect a full resolution in the near future.\r\nWe will provide another status update by 20:45 US/Pacific with current details.", "when": "2018-02-01T04:23:56Z"}, {"created": "2018-02-01T03:52:49Z", "modified": "2018-02-01T03:52:49Z", "text": "We are investigating an issue with Google Compute Engine. 
We will provide more information by 20:30 US/Pacific.", "when": "2018-02-01T03:52:49Z"}, {"created": "2018-02-01T03:52:48Z", "modified": "2018-02-01T03:52:48Z", "text": "", "when": "2018-02-01T03:52:48Z"}], "uri": "/incident/compute/18001"}, {"begin": "2018-02-01T02:20:44Z", "created": "2018-02-01T03:33:45Z", "end": "2018-02-01T03:50:20Z", "external_desc": "We are investigating an issue with Google Kubernetes Engine. We will provide more information by 20:15 US/Pacific.", "modified": "2018-02-07T18:06:26Z", "most-recent-update": {"created": "2018-02-07T18:06:26Z", "modified": "2018-02-07T18:06:26Z", "text": "On Wednesday 31 January 2018, some Google Cloud services experienced elevated errors and latency on operations that required inter-data center network traffic for a duration of 72 minutes. The impact was visible during three windows: between 18:20 and 19:08 PST, between 19:10 and 19:29, and again between 19:45 and 19:50. Network traffic between the public internet and Google's data centers was not affected by this incident.\r\n\r\nThe root cause of this incident was an error in a configuration update to the system that allocates network capacity for traffic between Google data centers. \r\n\r\nTo prevent a recurrence, we will improve the automated checks that we run on configuration changes to detect problems before release. We will be improving the monitoring of the canary to detect problems before global rollout of changes to the configuration.\r\n", "when": "2018-02-07T18:06:26Z"}, "number": 18002, "public": true, "service_key": "container-engine", "service_name": "Google Kubernetes Engine", "severity": "medium", "updates": [{"created": "2018-02-07T18:06:26Z", "modified": "2018-02-07T18:06:26Z", "text": "On Wednesday 31 January 2018, some Google Cloud services experienced elevated errors and latency on operations that required inter-data center network traffic for a duration of 72 minutes. 
The impact was visible during three windows: between 18:20 and 19:08 PST, between 19:10 and 19:29, and again between 19:45 and 19:50. Network traffic between the public internet and Google's data centers was not affected by this incident.\r\n\r\nThe root cause of this incident was an error in a configuration update to the system that allocates network capacity for traffic between Google data centers. \r\n\r\nTo prevent a recurrence, we will improve the automated checks that we run on configuration changes to detect problems before release. We will be improving the monitoring of the canary to detect problems before global rollout of changes to the configuration.\r\n", "when": "2018-02-07T18:06:26Z"}, {"created": "2018-02-01T04:35:19Z", "modified": "2018-02-01T04:35:19Z", "text": "The issue with Google Kubernetes Engine has been resolved for all affected projects as of 20:33 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.\r\nFor everyone who is affected, we apologize for the disruption.", "when": "2018-02-01T04:35:19Z"}, {"created": "2018-02-01T04:20:26Z", "modified": "2018-02-01T04:20:26Z", "text": "The issue with Google Kubernetes Engine should be resolved for the majority of projects and we expect a full resolution in the near future.\r\nWe will provide another status update by 20:45 US/Pacific with current details.", "when": "2018-02-01T04:20:26Z"}, {"created": "2018-02-01T03:33:45Z", "modified": "2018-02-01T03:33:45Z", "text": "We are investigating an issue with Google Container Engine. 
We will provide more information by 20:15 US/Pacific.", "when": "2018-02-01T03:33:45Z"}], "uri": "/incident/container-engine/18002"}, {"begin": "2018-02-01T02:20:26Z", "created": "2018-02-01T03:28:28Z", "end": "2018-02-01T03:50:55Z", "external_desc": "The issue with Google Stackdriver Logging has been resolved for all affected projects as of 20:02 US/Pacific.", "modified": "2018-02-07T18:06:58Z", "most-recent-update": {"created": "2018-02-07T18:06:58Z", "modified": "2018-02-07T18:06:58Z", "text": "On Wednesday 31 January 2018, some Google Cloud services experienced elevated errors and latency on operations that required inter-data center network traffic for a duration of 72 minutes. The impact was visible during three windows: between 18:20 and 19:08 PST, between 19:10 and 19:29, and again between 19:45 and 19:50. Network traffic between the public internet and Google's data centers was not affected by this incident.\r\n\r\nThe root cause of this incident was an error in a configuration update to the system that allocates network capacity for traffic between Google data centers. \r\n\r\nTo prevent a recurrence, we will improve the automated checks that we run on configuration changes to detect problems before release. We will be improving the monitoring of the canary to detect problems before global rollout of changes to the configuration.\r\n", "when": "2018-02-07T18:06:58Z"}, "number": 18001, "public": true, "service_key": "google-stackdriver", "service_name": "Google Stackdriver", "severity": "medium", "updates": [{"created": "2018-02-07T18:06:58Z", "modified": "2018-02-07T18:06:58Z", "text": "On Wednesday 31 January 2018, some Google Cloud services experienced elevated errors and latency on operations that required inter-data center network traffic for a duration of 72 minutes. The impact was visible during three windows: between 18:20 and 19:08 PST, between 19:10 and 19:29, and again between 19:45 and 19:50. 
Network traffic between the public internet and Google's data centers was not affected by this incident.\r\n\r\nThe root cause of this incident was an error in a configuration update to the system that allocates network capacity for traffic between Google data centers. \r\n\r\nTo prevent a recurrence, we will improve the automated checks that we run on configuration changes to detect problems before release. We will be improving the monitoring of the canary to detect problems before global rollout of changes to the configuration.\r\n", "when": "2018-02-07T18:06:58Z"}, {"created": "2018-02-01T04:49:55Z", "modified": "2018-02-01T04:49:55Z", "text": "The issue with Google Stackdriver Logging has been resolved for all affected projects as of 20:02 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.\r\nFor everyone who is affected, we apologize for the disruption.", "when": "2018-02-01T04:49:55Z"}, {"created": "2018-02-01T04:23:08Z", "modified": "2018-02-01T04:23:08Z", "text": "The issue with Google Stackdriver Logging should be resolved for the majority of projects and we expect a full resolution in the near future.\r\nWe will provide another status update by 20:45 US/Pacific with current details.", "when": "2018-02-01T04:23:08Z"}, {"created": "2018-02-01T03:28:35Z", "modified": "2018-02-01T03:28:35Z", "text": "We are investigating an issue with Google Stackdriver Logging. We will provide more information by 20:00 US/Pacific.", "when": "2018-02-01T03:28:35Z"}], "uri": "/incident/google-stackdriver/18001"}, {"begin": "2018-02-01T02:20:03Z", "created": "2018-02-01T03:52:04Z", "end": "2018-02-01T03:50:44Z", "external_desc": "The issue with Google App Engine services has been resolved for all affected projects as of 21:00 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "modified": "2018-02-07T18:05:02Z", "most-recent-update": {"created": "2018-02-07T18:05:02Z", "modified": "2018-02-07T18:05:02Z", "text": "On Wednesday 31 January 2018, some Google Cloud services experienced elevated errors and latency on operations that required inter-data center network traffic for a duration of 72 minutes. The impact was visible during three windows: between 18:20 and 19:08 PST, between 19:10 and 19:29, and again between 19:45 and 19:50. Network traffic between the public internet and Google's data centers was not affected by this incident.\r\n\r\nThe root cause of this incident was an error in a configuration update to the system that allocates network capacity for traffic between Google data centers. \r\n\r\nTo prevent a recurrence, we will improve the automated checks that we run on configuration changes to detect problems before release. We will be improving the monitoring of the canary to detect problems before global rollout of changes to the configuration.\r\n", "when": "2018-02-07T18:05:02Z"}, "number": 18001, "public": true, "service_key": "appengine", "service_name": "Google App Engine", "severity": "medium", "updates": [{"created": "2018-02-07T18:05:02Z", "modified": "2018-02-07T18:05:02Z", "text": "On Wednesday 31 January 2018, some Google Cloud services experienced elevated errors and latency on operations that required inter-data center network traffic for a duration of 72 minutes. The impact was visible during three windows: between 18:20 and 19:08 PST, between 19:10 and 19:29, and again between 19:45 and 19:50. Network traffic between the public internet and Google's data centers was not affected by this incident.\r\n\r\nThe root cause of this incident was an error in a configuration update to the system that allocates network capacity for traffic between Google data centers. 
\r\n\r\nTo prevent a recurrence, we will improve the automated checks that we run on configuration changes to detect problems before release. We will be improving the monitoring of the canary to detect problems before global rollout of changes to the configuration.\r\n", "when": "2018-02-07T18:05:02Z"}, {"created": "2018-02-01T04:59:44Z", "modified": "2018-02-01T04:59:44Z", "text": "The issue with Google App Engine has been resolved for all affected projects as of 21:00 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-02-01T04:59:44Z"}, {"created": "2018-02-01T04:38:28Z", "modified": "2018-02-01T04:38:28Z", "text": "Google App Engine has mostly recovered and our engineering team is working to completely mitigate the issue. We will provide an update by 21:00 US/Pacific with current details.", "when": "2018-02-01T04:38:28Z"}, {"created": "2018-02-01T04:06:14Z", "modified": "2018-02-01T04:06:14Z", "text": "Google App Engine is recovering and our engineering team is working to completely mitigate the issue. We will provide an update by 20:30 US/Pacific with current details.", "when": "2018-02-01T04:06:14Z"}, {"created": "2018-02-01T03:52:04Z", "modified": "2018-02-01T03:52:04Z", "text": "We are experiencing an issue with multiple Google Cloud Platform Services, beginning at approximately 18:30 US/Pacific. The situation is getting improved for the most of products, but some products are still reporting errors. For everyone who is affected, we apologize for the disruption. We will provide an update by 20:00 US/Pacific with current details.", "when": "2018-02-01T03:52:04Z"}], "uri": "/incident/appengine/18001"}, {"begin": "2018-01-25T19:05:00Z", "created": "2018-01-25T20:09:18Z", "end": "2018-01-25T20:30:28Z", "external_desc": "We are investigating an issue with BigQuery in the US region. 
We will provide more information by 13:00 US/Pacific.", "modified": "2018-01-25T23:57:59Z", "most-recent-update": {"created": "2018-01-25T20:36:28Z", "modified": "2018-01-25T20:36:28Z", "text": "The issue with BigQuery error rates has been resolved for all affected users as of 12:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-01-25T20:36:28Z"}, "number": 18034, "public": true, "service_key": "bigquery", "service_name": "Google BigQuery", "severity": "medium", "updates": [{"created": "2018-01-25T20:36:28Z", "modified": "2018-01-25T20:36:28Z", "text": "The issue with BigQuery error rates has been resolved for all affected users as of 12:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-01-25T20:36:28Z"}, {"created": "2018-01-25T20:09:18Z", "modified": "2018-01-25T20:38:42Z", "text": "We are investigating an issue with Google BigQuery in the US region. We will provide more information by 12:45 US/Pacific.", "when": "2018-01-25T20:09:18Z"}], "uri": "/incident/bigquery/18034"}, {"begin": "2018-01-25T19:05:00Z", "created": "2018-01-25T20:21:03Z", "end": "2018-01-25T20:24:41Z", "external_desc": "We are investigating an issue with Google Cloud Storage in the US region. We will provide more information by 13:00 US/Pacific.", "modified": "2018-01-25T23:56:17Z", "most-recent-update": {"created": "2018-01-25T23:48:50Z", "modified": "2018-01-25T23:48:50Z", "text": "We have updated our estimated impact to an average of 2% and a peak of 3.6% GCS global error rate, based on a more thorough review of monitoring data and server logs. 
The initial estimate of impact was based on an internal assessment hosted in a single region; subsequent investigation reveals that Google's redundancy and rerouting infrastructure worked as intended and dramatically reduced the user-visible impact of the event to GCS's global user base. The 2% average error rate is measured over the duration of the event, from its beginning at 11:05 PST to its conclusion at 12:24 PST.", "when": "2018-01-25T23:48:50Z"}, "number": 18001, "public": true, "service_key": "storage", "service_name": "Google Cloud Storage", "severity": "medium", "updates": [{"created": "2018-01-25T23:48:50Z", "modified": "2018-01-25T23:48:50Z", "text": "We have updated our estimated impact to an average of 2% and a peak of 3.6% GCS global error rate, based on a more thorough review of monitoring data and server logs. The initial estimate of impact was based on an internal assessment hosted in a single region; subsequent investigation reveals that Google's redundancy and rerouting infrastructure worked as intended and dramatically reduced the user-visible impact of the event to GCS's global user base. The 2% average error rate is measured over the duration of the event, from its beginning at 11:05 PST to its conclusion at 12:24 PST.", "when": "2018-01-25T23:48:50Z"}, {"created": "2018-01-25T20:56:41Z", "modified": "2018-01-25T20:56:41Z", "text": "The issue with Google Cloud Storage error rates has been resolved for all affected users as of 12:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-01-25T20:56:41Z"}, {"created": "2018-01-25T20:36:11Z", "modified": "2018-01-25T20:36:11Z", "text": "The issue with Google Cloud Storage in the US regions should be resolved for the majority of users and we expect a full resolution in the near future. 
We will provide another status update by 2018-01-25 13:00 US/Pacific with current details.", "when": "2018-01-25T20:36:11Z"}, {"created": "2018-01-25T20:21:03Z", "modified": "2018-01-25T20:21:03Z", "text": "We are experiencing an issue with Google Cloud Storage beginning Thursday, 2018-01-25 11:23 US/Pacific. Current investigation indicates that approximately 100% of customers in the US region are affected and we expect that for affected users the service is mostly or entirely unavailable at this time. For everyone who is affected, we apologize for the disruption. We will provide an update by 2018-01-25 13:00 US/Pacific with current details.", "when": "2018-01-25T20:21:03Z"}], "uri": "/incident/storage/18001"}, {"begin": "2018-01-25T19:03:00Z", "created": "2018-01-25T20:21:56Z", "end": "2018-01-25T23:27:23Z", "external_desc": "We are investigating an issue with Google Cloud Networking. We will provide more information by 13:00 US/Pacific.", "modified": "2018-02-01T23:49:49Z", "most-recent-update": {"created": "2018-02-01T23:48:42Z", "modified": "2018-02-01T23:48:42Z", "text": "On Thursday 25 January 2018, while expanding the Google network serving the us-central1 region, a configuration change unexpectedly triggered packet loss and reduced network bandwidth for inter-region data transfer and replication traffic to and from the region. The network impact was observable during two windows, between 11:03 and 12:40 PST, and again between 14:27 and 15:27 PST. The principal user-visible impact was a degradation in the performance of some Google Cloud services that require cross data center traffic. There was no impact to network traffic between the us-central1 region and the internet, or to traffic between Compute Engine VM instances.\r\n\r\nWe sincerely apologize for the impact of this incident on your application or service. 
We have performed a detailed analysis of root cause and taken careful steps to ensure that this type of incident will not recur.\r\n", "when": "2018-02-01T23:48:42Z"}, "number": 18004, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "medium", "updates": [{"created": "2018-02-01T23:48:42Z", "modified": "2018-02-01T23:48:42Z", "text": "On Thursday 25 January 2018, while expanding the Google network serving the us-central1 region, a configuration change unexpectedly triggered packet loss and reduced network bandwidth for inter-region data transfer and replication traffic to and from the region. The network impact was observable during two windows, between 11:03 and 12:40 PST, and again between 14:27 and 15:27 PST. The principal user-visible impact was a degradation in the performance of some Google Cloud services that require cross data center traffic. There was no impact to network traffic between the us-central1 region and the internet, or to traffic between Compute Engine VM instances.\r\n\r\nWe sincerely apologize for the impact of this incident on your application or service. We have performed a detailed analysis of root cause and taken careful steps to ensure that this type of incident will not recur.\r\n", "when": "2018-02-01T23:48:42Z"}, {"created": "2018-01-25T21:18:23Z", "modified": "2018-01-25T21:18:23Z", "text": "The issue with Google Cloud Networking has been resolved for all affected users as of 2018-01-25 13:15 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. 
We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-01-25T21:18:23Z"}, {"created": "2018-01-25T21:11:16Z", "modified": "2018-01-25T21:11:16Z", "text": "The issue with Google Cloud Networking should be resolved for the majority of users and we expect a full resolution in the near future. We will provide another status update by 2018-01-25 14:00 US/Pacific with current details.", "when": "2018-01-25T21:11:16Z"}, {"created": "2018-01-25T20:21:56Z", "modified": "2018-01-25T20:21:56Z", "text": "We are investigating an issue with Google Cloud Networking. We will provide more information by 13:00 US/Pacific.", "when": "2018-01-25T20:21:56Z"}], "uri": "/incident/cloud-networking/18004"}, {"begin": "2018-01-18T17:52:48Z", "created": "2018-01-18T19:23:49Z", "end": "2018-01-18T19:26:18Z", "external_desc": "We are investigating an issue with Google Cloud Networking affecting connectivity in us-central1 and europe-west3. We will provide more information by 12:15pm US/Pacific.", "modified": "2018-02-17T00:42:51Z", "most-recent-update": {"created": "2018-02-17T00:42:51Z", "modified": "2018-02-17T00:42:51Z", "text": "ISSUE SUMMARY\r\n\r\nOn Sunday 18 January 2018, Google Compute Engine networking experienced a network programming failure.  The two impacts of this incident included the autoscaler not scaling instance groups, as well as migrated and newly-created VMs not communicating with VMs in other zones for a duration of up to 93 minutes. 
We apologize for the impact this event had on your applications and projects, and we will carefully investigate the causes and implement measures to prevent recurrences.\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nOn Sunday 18 January 2018, Google Compute Engine network provisioning updates failed in the following zones: \r\neurope-west3-a for 34 minutes (09:52 AM to 10:21 AM PT)\r\nus-central1-b for 79 minutes (09:57 AM to 11:16 AM PT)\r\nasia-northeast1-a for 93 minutes (09:53 AM to 11:26 AM PT)\r\n\r\nPropagation of Google Compute Engine networking configuration for newly created and migrated VMs is handled by two components. The first is responsible for providing a complete list of VM\u2019s, networks, firewall rules, and scaling decisions.  The second component provides a stream of updates for the components in a specific zone. \r\n\r\nDuring the affected period, the first component failed to return data.  VMs in the affected zones were unable to communicate with newly-created or migrated VMs in another zone in the same private GCE network. VMs in the same zone were unaffected because they are updated by the streaming component.\r\n\r\nThe autoscaler service also relies upon data from the failed first component to scale instance groups; without updates from that component, it could not make scaling decisions for the affected zones.\r\n\r\n\r\nROOT CAUSE\r\n\r\nA stuck process failed to provide updates to the Compute Engine control plane.  Automatic failover was unable to force-stop the process, and required manual failover to restore normal operation.\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nThe engineering team was alerted when the propagation of network configuration information stalled.  
They manually failed over to the replacement task to restore normal operation of the data persistence layer.\r\n\r\nTo prevent another occurrence of this incident, we are taking the following actions:\r\nWe will stop VM migrations if the configuration data is stale.\r\nModify the data persistence layer to re-resolve their peers during long-running processes, to allow failover to replacement tasks.\r\n", "when": "2018-02-17T00:42:51Z"}, "number": 18003, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "medium", "updates": [{"created": "2018-02-17T00:42:51Z", "modified": "2018-02-17T00:42:51Z", "text": "ISSUE SUMMARY\r\n\r\nOn Sunday 18 January 2018, Google Compute Engine networking experienced a network programming failure.  The two impacts of this incident included the autoscaler not scaling instance groups, as well as migrated and newly-created VMs not communicating with VMs in other zones for a duration of up to 93 minutes. We apologize for the impact this event had on your applications and projects, and we will carefully investigate the causes and implement measures to prevent recurrences.\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nOn Sunday 18 January 2018, Google Compute Engine network provisioning updates failed in the following zones: \r\neurope-west3-a for 34 minutes (09:52 AM to 10:21 AM PT)\r\nus-central1-b for 79 minutes (09:57 AM to 11:16 AM PT)\r\nasia-northeast1-a for 93 minutes (09:53 AM to 11:26 AM PT)\r\n\r\nPropagation of Google Compute Engine networking configuration for newly created and migrated VMs is handled by two components. The first is responsible for providing a complete list of VM\u2019s, networks, firewall rules, and scaling decisions.  The second component provides a stream of updates for the components in a specific zone. \r\n\r\nDuring the affected period, the first component failed to return data.  
VMs in the affected zones were unable to communicate with newly-created or migrated VMs in another zone in the same private GCE network. VMs in the same zone were unaffected because they are updated by the streaming component.\r\n\r\nThe autoscaler service also relies upon data from the failed first component to scale instance groups; without updates from that component, it could not make scaling decisions for the affected zones.\r\n\r\n\r\nROOT CAUSE\r\n\r\nA stuck process failed to provide updates to the Compute Engine control plane.  Automatic failover was unable to force-stop the process, and required manual failover to restore normal operation.\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nThe engineering team was alerted when the propagation of network configuration information stalled.  They manually failed over to the replacement task to restore normal operation of the data persistence layer.\r\n\r\nTo prevent another occurrence of this incident, we are taking the following actions:\r\nWe will stop VM migrations if the configuration data is stale.\r\nModify the data persistence layer to re-resolve their peers during long-running processes, to allow failover to replacement tasks.\r\n", "when": "2018-02-17T00:42:51Z"}, {"created": "2018-01-18T20:01:17Z", "modified": "2018-01-18T20:01:17Z", "text": "The issue with Google Cloud Networking connectivity has been resolved for all affected zones in europe-west3, us-central1, and asia-northeast1 as of 11:26am US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2018-01-18T20:01:17Z"}, {"created": "2018-01-18T19:23:50Z", "modified": "2018-01-18T19:23:50Z", "text": "We are investigating an issue with Google Cloud Networking affecting connectivity in us-central1 and europe-west3. 
We will provide more information by 12:15pm US/Pacific.", "when": "2018-01-18T19:23:50Z"}], "uri": "/incident/cloud-networking/18003"}, {"begin": "2018-01-09T20:38:48Z", "created": "2018-01-09T20:38:49Z", "end": "2018-01-09T21:47:21Z", "external_desc": "We have resolved an issue with Google Cloud Networking.", "modified": "2018-01-09T21:47:22Z", "most-recent-update": {"created": "2018-01-09T21:47:20Z", "modified": "2018-01-09T21:47:20Z", "text": "The issue with packet loss from North and South America regions to Asia regions has been resolved for all affected users as of 1:45 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-01-09T21:47:20Z"}, "number": 18002, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "high", "updates": [{"created": "2018-01-09T21:47:20Z", "modified": "2018-01-09T21:47:20Z", "text": "The issue with packet loss from North and South America regions to Asia regions has been resolved for all affected users as of 1:45 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-01-09T21:47:20Z"}, {"created": "2018-01-09T21:17:47Z", "modified": "2018-01-09T21:17:47Z", "text": "Our Engineering Team believes they have identified the root cause of the packet loss and is working to mitigate.", "when": "2018-01-09T21:17:47Z"}, {"created": "2018-01-09T20:55:51Z", "modified": "2018-01-09T20:55:51Z", "text": "We are experiencing an issue with packet loss from Google North and South America regions to Asia regions beginning at 12:30 US/Pacific. Current data indicates that approximately 40% of packets  are affected by this issue. For everyone who is affected, we apologize for the disruption. 
We will provide an update by  1:30 US/Pacific with current details.", "when": "2018-01-09T20:55:51Z"}, {"created": "2018-01-09T20:38:50Z", "modified": "2018-01-09T20:38:50Z", "text": "We are investigating an issue with Google Cloud Networking. We will provide more information by 13:15 US/Pacific.", "when": "2018-01-09T20:38:50Z"}], "uri": "/incident/cloud-networking/18002"}, {"begin": "2018-01-08T14:22:00Z", "created": "2018-01-08T15:19:45Z", "end": "2018-01-08T16:27:33Z", "external_desc": "Some GKE 'create/delete' cluster operations failing", "modified": "2018-01-08T17:39:35Z", "most-recent-update": {"created": "2018-01-08T17:38:32Z", "modified": "2018-01-08T17:38:32Z", "text": "The issue with GKE 'create/delete' cluster operations failing has been resolved for all affected\r\nusers as of 8:27 US/Pacific. We will conduct\r\nan internal investigation of this issue and make appropriate improvements to our\r\nsystems to help prevent or minimize future recurrence.", "when": "2018-01-08T17:38:32Z"}, "number": 18001, "public": true, "service_key": "container-engine", "service_name": "Google Kubernetes Engine", "severity": "medium", "updates": [{"created": "2018-01-08T17:38:32Z", "modified": "2018-01-08T17:38:32Z", "text": "The issue with GKE 'create/delete' cluster operations failing has been resolved for all affected\r\nusers as of 8:27 US/Pacific. We will conduct\r\nan internal investigation of this issue and make appropriate improvements to our\r\nsystems to help prevent or minimize future recurrence.", "when": "2018-01-08T17:38:32Z"}, {"created": "2018-01-08T17:37:55Z", "modified": "2018-01-08T17:37:55Z", "text": "The issue with GKE 'create/delete' cluster operations failing should be resolved for the majority of users. We will continue monitoring to confirm full resolution. 
We will provide more information by 11:30 US/Pacific.", "when": "2018-01-08T17:37:55Z"}, {"created": "2018-01-08T16:32:30Z", "modified": "2018-01-08T16:32:30Z", "text": "The issue with GKE 'create/delete' cluster operations failing should be resolved for the majority of users. We will continue monitoring to confirm full resolution. We will provide more information by 09:30 US/Pacific.", "when": "2018-01-08T16:32:30Z"}, {"created": "2018-01-08T16:00:10Z", "modified": "2018-01-08T16:00:10Z", "text": "We are still investigating an issue with some GKE 'create/delete' cluster operations failing. We will provide more information by 08:30 US/Pacific.", "when": "2018-01-08T16:00:10Z"}, {"created": "2018-01-08T15:19:45Z", "modified": "2018-01-08T15:19:45Z", "text": "We are investigating an issue with GKE 'create cluster' operations failing. We will provide more information by 08:00 US/Pacific.", "when": "2018-01-08T15:19:45Z"}], "uri": "/incident/container-engine/18001"}, {"begin": "2018-01-03T23:34:00Z", "created": "2018-01-03T23:34:01Z", "end": "2018-01-04T04:37:27Z", "external_desc": "The issue with Google Cloud Load Balancing (GCLB) creation has been resolved for all affected projects as of 20:22 US/Pacific.", "modified": "2018-01-04T04:37:27Z", "most-recent-update": {"created": "2018-01-04T04:37:27Z", "modified": "2018-01-04T04:37:27Z", "text": "The issue with Google Cloud Load Balancing (GCLB) creation has been resolved for all affected projects as of 20:22 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-01-04T04:37:27Z"}, "number": 18001, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "high", "updates": [{"created": "2018-01-04T04:37:27Z", "modified": "2018-01-04T04:37:27Z", "text": "The issue with Google Cloud Load Balancing (GCLB) creation has been resolved for all affected projects as of 20:22 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2018-01-04T04:37:27Z"}, {"created": "2018-01-04T04:00:16Z", "modified": "2018-01-04T04:00:16Z", "text": "We are investigating an issue with Google Cloud Networking that is preventing the creation of new GCLB load balancers and updating of existing ones. We will provide more information by 20:30 US/Pacific.", "when": "2018-01-04T04:00:16Z"}, {"created": "2018-01-04T03:32:02Z", "modified": "2018-01-04T03:32:02Z", "text": "We are investigating an issue with Google Cloud Networking that is preventing the creation of new GCLB load balancers and updating of existing ones. We will provide more information by 20:00 US/Pacific.", "when": "2018-01-04T03:32:02Z"}, {"created": "2018-01-04T03:02:26Z", "modified": "2018-01-04T03:02:26Z", "text": "We are investigating an issue with Google Cloud Networking that is preventing the creation of new GCLB load balancers and updating of existing ones. We will provide more information by 19:30 US/Pacific.", "when": "2018-01-04T03:02:26Z"}, {"created": "2018-01-04T02:31:40Z", "modified": "2018-01-04T02:31:40Z", "text": "We are investigating an issue with Google Cloud Networking that is preventing the creation of new GCLB load balancers and updating of existing ones. 
We will provide more information by 19:00 US/Pacific.", "when": "2018-01-04T02:31:40Z"}, {"created": "2018-01-04T02:02:14Z", "modified": "2018-01-04T02:02:14Z", "text": "We are investigating an issue with Google Cloud Networking that is preventing the creation of new GCLB load balancers and updating of existing ones. We will provide more information by 18:30 US/Pacific.", "when": "2018-01-04T02:02:14Z"}, {"created": "2018-01-04T01:31:44Z", "modified": "2018-01-04T01:31:44Z", "text": "We are investigating an issue with Google Cloud Networking that is preventing the creation of new GCLB load balancers and updating of existing ones. We will provide more information by 18:00 US/Pacific.", "when": "2018-01-04T01:31:44Z"}, {"created": "2018-01-04T01:01:57Z", "modified": "2018-01-04T01:01:57Z", "text": "We are investigating an issue with Google Cloud Networking that is preventing the creation of new GCLB load balancers and updating of existing ones. We will provide more information by 17:30 US/Pacific.", "when": "2018-01-04T01:01:57Z"}, {"created": "2018-01-04T00:31:02Z", "modified": "2018-01-04T00:31:02Z", "text": "We are investigating an issue with Google Cloud Networking that is preventing the creation of new GCLB load balancers and updating of existing ones. We will provide more information by 17:00 US/Pacific.", "when": "2018-01-04T00:31:02Z"}, {"created": "2018-01-04T00:01:17Z", "modified": "2018-01-04T00:01:17Z", "text": "We are investigating an issue with Google Cloud Networking that is preventing the creation of new GCLB load balancers and updating of existing ones. We will provide more information by 16:30 US/Pacific.", "when": "2018-01-04T00:01:17Z"}, {"created": "2018-01-03T23:34:02Z", "modified": "2018-01-03T23:34:02Z", "text": "We are investigating an issue with Google Cloud Networking that is preventing the creation of new GCLB load balancers and updating of existing ones. 
We will provide more information by 16:00 US/Pacific.", "when": "2018-01-03T23:34:02Z"}], "uri": "/incident/cloud-networking/18001"}, {"begin": "2018-01-03T20:45:46Z", "created": "2018-01-03T22:50:47Z", "end": "2018-01-03T21:26:59Z", "external_desc": "Cloud Spanner issues from 12:45 to 13:26 Pacific time have been resolved.", "modified": "2018-01-04T21:17:22Z", "most-recent-update": {"created": "2018-01-03T23:00:58Z", "modified": "2018-01-03T23:00:58Z", "text": "Cloud Spanner issues from 12:45 to 13:26 Pacific time have been resolved.", "when": "2018-01-03T23:00:58Z"}, "number": 18001, "public": true, "service_key": "cloud-spanner", "service_name": "Cloud Spanner", "severity": "high", "updates": [{"created": "2018-01-03T23:00:58Z", "modified": "2018-01-03T23:00:58Z", "text": "Cloud Spanner issues from 12:45 to 13:26 Pacific time have been resolved.", "when": "2018-01-03T23:00:58Z"}, {"created": "2018-01-03T22:50:50Z", "modified": "2018-01-03T22:50:50Z", "text": "An incident with Cloud Spanner availability started at 12:45 Pacific time and has been addressed. The service is restored for all customers as of 13:26. Another update will be posted before 15:00 Pacific time to confirm the service health.", "when": "2018-01-03T22:50:50Z"}, {"created": "2018-01-03T22:50:48Z", "modified": "2018-01-03T22:50:48Z", "text": "", "when": "2018-01-03T22:50:48Z"}], "uri": "/incident/cloud-spanner/18001"}, {"begin": "2017-12-15T18:49:18Z", "created": "2017-12-15T18:49:19Z", "end": "2017-12-15T19:59:25Z", "external_desc": "The issue with Cloud Machine Learning Engine's Create Version has been resolved for all affected users as of 2017-12-15 10:55 US/Pacific.", "modified": "2017-12-15T19:59:25Z", "most-recent-update": {"created": "2017-12-15T19:59:24Z", "modified": "2017-12-15T19:59:24Z", "text": "The issue with Cloud Machine Learning Engine's Create Version has been resolved for all affected users as of 2017-12-15 10:55 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-12-15T19:59:24Z"}, "number": 17002, "public": true, "service_key": "cloud-ml", "service_name": "Cloud Machine Learning", "severity": "high", "updates": [{"created": "2017-12-15T19:59:24Z", "modified": "2017-12-15T19:59:24Z", "text": "The issue with Cloud Machine Learning Engine's Create Version has been resolved for all affected users as of 2017-12-15 10:55 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-12-15T19:59:24Z"}, {"created": "2017-12-15T19:19:22Z", "modified": "2017-12-15T19:19:22Z", "text": "The issue with Cloud Machine Learning Engine's Create Version should be resolved for the majority of users and we expect a full resolution in the near future. We will provide another status update by 12:00 US/Pacific with current details.", "when": "2017-12-15T19:19:22Z"}], "uri": "/incident/cloud-ml/17002"}, {"begin": "2017-12-14T19:59:33Z", "created": "2017-12-14T19:59:34Z", "end": "2017-12-14T20:28:57Z", "external_desc": "The issue with the Google App Engine Admin API has been resolved for all affected users as of Thursday, 2017-12-14 12:15 US/Pacific.", "modified": "2017-12-14T20:28:57Z", "most-recent-update": {"created": "2017-12-14T20:28:56Z", "modified": "2017-12-14T20:28:56Z", "text": "The issue with the Google App Engine Admin API has been resolved for all affected users as of Thursday, 2017-12-14 12:15 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-12-14T20:28:56Z"}, "number": 17009, "public": true, "service_key": "appengine", "service_name": "Google App Engine", "severity": "high", "updates": [{"created": "2017-12-14T20:28:56Z", "modified": "2017-12-14T20:28:56Z", "text": "The issue with the Google App Engine Admin API has been resolved for all affected users as of Thursday, 2017-12-14 12:15 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-12-14T20:28:56Z"}], "uri": "/incident/appengine/17009"}, {"begin": "2017-11-30T22:15:46Z", "created": "2017-11-30T22:16:04Z", "end": "2017-12-01T02:04:28Z", "external_desc": "We are investigating an issue with Google Cloud Storage. We will provide more information by 18:00 US/Pacific.", "modified": "2017-12-01T02:04:28Z", "most-recent-update": {"created": "2017-12-01T02:04:27Z", "modified": "2017-12-01T02:04:27Z", "text": "The issue with Cloud Storage elevated error rate has been resolved for all affected projects as of Friday 2017-11-30 16:10 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-12-01T02:04:27Z"}, "number": 17006, "public": true, "service_key": "storage", "service_name": "Google Cloud Storage", "severity": "medium", "updates": [{"created": "2017-12-01T02:04:27Z", "modified": "2017-12-01T02:04:27Z", "text": "The issue with Cloud Storage elevated error rate has been resolved for all affected projects as of Friday 2017-11-30 16:10 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-12-01T02:04:27Z"}, {"created": "2017-11-30T23:47:55Z", "modified": "2017-12-01T01:41:39Z", "text": "The issue with Cloud Storage elevated error rate issue should be resolved for\r\nthe majority of projects and we expect a full resolution in the near future.\r\nWe will provide another status update by 2017-11-30 18:00 US/Pacific\r\nwith current details.", "when": "2017-11-30T23:47:55Z"}, {"created": "2017-11-30T23:19:28Z", "modified": "2017-11-30T23:19:28Z", "text": "The Cloud Storage service is experiencing less than 10% error rate. We will provide another status update by 2017-11-30 16:30 US/Pacific with current details.", "when": "2017-11-30T23:19:28Z"}, {"created": "2017-11-30T23:16:54Z", "modified": "2017-11-30T23:16:54Z", "text": "The Cloud Storage service is experiencing less than 10% error rate. We will provide another status update by 2017-11-30 16:30 US/Pacific with current details.", "when": "2017-11-30T23:16:54Z"}, {"created": "2017-11-30T22:17:21Z", "modified": "2017-11-30T22:17:21Z", "text": "The Cloud Storage service is experiencing less than 10% error rate. We will provide another status update by 2017-11-30 15:00 US/Pacific with current details.", "when": "2017-11-30T22:17:21Z"}, {"created": "2017-11-30T22:16:17Z", "modified": "2017-11-30T22:16:17Z", "text": "The Cloud Storage service is experiencing less than 10% error rate. We will provide another status update by YYYY-mm-dd HH:MM US/Pacific with current details.", "when": "2017-11-30T22:16:17Z"}], "uri": "/incident/storage/17006"}, {"begin": "2017-11-21T20:14:42Z", "created": "2017-11-21T20:15:11Z", "end": "2017-11-21T20:15:18Z", "external_desc": "From 10:58 to 11:57 US/Pacific, GCE VM instances experienced packet loss from GCE instances to the Internet. 
The issue has been mitigated for all affected projects.", "modified": "2017-11-21T20:16:15Z", "most-recent-update": {"created": "2017-11-21T20:15:17Z", "modified": "2017-11-21T20:15:17Z", "text": "From 10:58 to 11:57 US/Pacific, GCE VM instances experienced packet loss from GCE instances to the Internet. The issue has been mitigated for all affected projects.", "when": "2017-11-21T20:15:17Z"}, "number": 17006, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "high", "updates": [{"created": "2017-11-21T20:15:17Z", "modified": "2017-11-21T20:15:17Z", "text": "From 10:58 to 11:57 US/Pacific, GCE VM instances experienced packet loss from GCE instances to the Internet. The issue has been mitigated for all affected projects.", "when": "2017-11-21T20:15:17Z"}], "uri": "/incident/cloud-networking/17006"}, {"begin": "2017-11-17T14:17:45Z", "created": "2017-11-17T14:18:40Z", "end": "2017-11-17T15:23:00Z", "external_desc": "The issue with Google Cloud Engine VM instances losing connectivity has been resolved for all affected users as of Friday, 2017-11-17 7:17am US/Pacific.", "modified": "2017-11-17T15:23:03Z", "most-recent-update": {"created": "2017-11-17T15:22:59Z", "modified": "2017-11-17T15:22:59Z", "text": "The issue with Google Cloud Engine VM instances losing connectivity has been resolved for all affected users as of Friday, 2017-11-17 7:17am US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-11-17T15:22:59Z"}, "number": 17004, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "high", "updates": [{"created": "2017-11-17T15:22:59Z", "modified": "2017-11-17T15:22:59Z", "text": "The issue with Google Cloud Engine VM instances losing connectivity has been resolved for all affected users as of Friday, 2017-11-17 7:17am US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-11-17T15:22:59Z"}], "uri": "/incident/cloud-networking/17004"}, {"begin": "2017-11-08T10:40:00Z", "created": "2017-11-08T12:02:29Z", "end": "2017-11-08T11:40:00Z", "external_desc": "App Engine increasingly showing 5xx", "modified": "2017-11-08T14:29:25Z", "most-recent-update": {"created": "2017-11-08T14:29:25Z", "modified": "2017-11-08T14:29:25Z", "text": "The issue with App Engine has been resolved for all affected projects as of 4:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. ", "when": "2017-11-08T14:30:00Z"}, "number": 17008, "public": true, "service_key": "appengine", "service_name": "Google App Engine", "severity": "medium", "updates": [{"created": "2017-11-08T14:29:25Z", "modified": "2017-11-08T14:29:25Z", "text": "The issue with App Engine has been resolved for all affected projects as of 4:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. ", "when": "2017-11-08T14:30:00Z"}, {"created": "2017-11-08T13:05:45Z", "modified": "2017-11-08T13:05:45Z", "text": "The issue should be resolved for the majority of projects and affected End-users located in Europe sending requests to projects in any region. We will provide another status update by 06:00 US/Pacific with current details. ", "when": "2017-11-08T13:00:00Z"}, {"created": "2017-11-08T12:31:08Z", "modified": "2017-11-08T12:31:08Z", "text": "The issue also affected projects in other regions but should be resolved for the majority of projects. 
We will provide another status update by 05:00 US/Pacific with current details.", "when": "2017-11-08T12:30:00Z"}, {"created": "2017-11-08T12:02:29Z", "modified": "2017-11-08T12:02:29Z", "text": "We are investigating an issue with App Engine in Europe. The issue should be resolved for the majority of projects and we expect a full resolution in the near future. We will provide another status update by 04:30 US/Pacific with current details.", "when": "2017-11-08T12:02:29Z"}], "uri": "/incident/appengine/17008"}, {"begin": "2017-11-06T20:33:00Z", "created": "2017-11-06T21:11:39Z", "end": "2017-11-06T22:23:00Z", "external_desc": "The Memcache service has recovered from a disruption between 12:30 US/Pacific and 15:30 US/Pacific.", "modified": "2017-11-08T13:54:54Z", "most-recent-update": {"created": "2017-11-07T18:59:28Z", "modified": "2017-11-08T13:54:54Z", "text": "ISSUE SUMMARY\r\n\r\nOn Monday 6 November 2017, the App Engine Memcache service experienced unavailability for applications in all regions for 1 hour and 50 minutes.\r\n\r\nWe sincerely apologize for the impact of this incident on your application or service. We recognize the severity of this incident and will be undertaking a detailed review to fully understand the ways in which we must change our systems to prevent a recurrence. \r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nOn Monday 6 November 2017 from 12:33 to 14:23 PST, the App Engine Memcache service experienced unavailability for applications in all regions.\r\n\r\nSome customers experienced elevated Datastore latency and errors while Memcache was unavailable. At this time, we believe that all the Datastore issues were caused by surges of Datastore activity due to Memcache being unavailable. 
When Memcache failed, if an application sent a surge of Datastore operations to specific entities or key ranges, then Datastore may have experienced contention or hotspotting, as described in https://cloud.google.com/datastore/docs/best-practices#designing_for_scale. Datastore experienced elevated load on its servers when the outage ended due to a surge in traffic. Some applications in the US experienced elevated latency on gets between 14:23 and 14:31, and elevated latency on puts between 14:23 and 15:04. \r\n\r\nCustomers running Managed VMs experienced failures of all HTTP requests and App Engine API calls during this incident. Customers using App Engine Flexible Environment, which is the successor to Managed VMs, were not impacted.\r\n\r\nROOT CAUSE\r\n\r\nThe App Engine Memcache service requires a globally consistent view of the current serving datacenter for each application in order to guarantee strong consistency when traffic fails over to alternate datacenters. The configuration which maps applications to datacenters is stored in a global database.\r\n\r\nThe incident occurred when the specific database entity that holds the configuration became unavailable for both reads and writes following a configuration update. App Engine Memcache is designed in such a way that the configuration is considered invalid if it cannot be refreshed within 20 seconds. When the configuration could not be fetched by clients, Memcache became unavailable.\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nGoogle received an automated alert at 12:34. Following normal practices, our engineers immediately looked for recent changes that may have triggered the incident. At 12:59, we attempted to revert the latest change to the configuration file. This configuration rollback required an update to the configuration in the global database, which also failed. At 14:21, engineers were able to update the configuration by sending an update request with a sufficiently long deadline. 
This caused all replicas of the database to synchronize and allowed clients to read the mapping configuration.\r\n\r\nAs a temporary mitigation, we have reduced the number of readers of the global configuration, which avoids the contention during write and led to the unavailability during the incident. Engineering projects are already under way to regionalize this configuration and thereby limit the blast radius of similar failure patterns in the future.\r\n", "when": "2017-11-07T18:59:28Z"}, "number": 17007, "public": true, "service_key": "appengine", "service_name": "Google App Engine", "severity": "high", "updates": [{"created": "2017-11-07T18:59:28Z", "modified": "2017-11-08T13:54:54Z", "text": "ISSUE SUMMARY\r\n\r\nOn Monday 6 November 2017, the App Engine Memcache service experienced unavailability for applications in all regions for 1 hour and 50 minutes.\r\n\r\nWe sincerely apologize for the impact of this incident on your application or service. We recognize the severity of this incident and will be undertaking a detailed review to fully understand the ways in which we must change our systems to prevent a recurrence. \r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nOn Monday 6 November 2017 from 12:33 to 14:23 PST, the App Engine Memcache service experienced unavailability for applications in all regions.\r\n\r\nSome customers experienced elevated Datastore latency and errors while Memcache was unavailable. At this time, we believe that all the Datastore issues were caused by surges of Datastore activity due to Memcache being unavailable. When Memcache failed, if an application sent a surge of Datastore operations to specific entities or key ranges, then Datastore may have experienced contention or hotspotting, as described in https://cloud.google.com/datastore/docs/best-practices#designing_for_scale. Datastore experienced elevated load on its servers when the outage ended due to a surge in traffic. 
Some applications in the US experienced elevated latency on gets between 14:23 and 14:31, and elevated latency on puts between 14:23 and 15:04. \r\n\r\nCustomers running Managed VMs experienced failures of all HTTP requests and App Engine API calls during this incident. Customers using App Engine Flexible Environment, which is the successor to Managed VMs, were not impacted.\r\n\r\nROOT CAUSE\r\n\r\nThe App Engine Memcache service requires a globally consistent view of the current serving datacenter for each application in order to guarantee strong consistency when traffic fails over to alternate datacenters. The configuration which maps applications to datacenters is stored in a global database.\r\n\r\nThe incident occurred when the specific database entity that holds the configuration became unavailable for both reads and writes following a configuration update. App Engine Memcache is designed in such a way that the configuration is considered invalid if it cannot be refreshed within 20 seconds. When the configuration could not be fetched by clients, Memcache became unavailable.\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nGoogle received an automated alert at 12:34. Following normal practices, our engineers immediately looked for recent changes that may have triggered the incident. At 12:59, we attempted to revert the latest change to the configuration file. This configuration rollback required an update to the configuration in the global database, which also failed. At 14:21, engineers were able to update the configuration by sending an update request with a sufficiently long deadline. This caused all replicas of the database to synchronize and allowed clients to read the mapping configuration.\r\n\r\nAs a temporary mitigation, we have reduced the number of readers of the global configuration, which avoids the contention during write and led to the unavailability during the incident. 
Engineering projects are already under way to regionalize this configuration and thereby limit the blast radius of similar failure patterns in the future.\r\n", "when": "2017-11-07T18:59:28Z"}, {"created": "2017-11-06T23:55:45Z", "modified": "2017-11-06T23:55:45Z", "text": "The issue with Memcache availability has been resolved for all affected projects as of 15:30 US/Pacific. \r\nWe will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. \r\nWe will provide a more detailed analysis of this incident once we have completed our internal investigation. \r\n\r\nThis is the final update for this incident.", "when": "2017-11-06T23:55:45Z"}, {"created": "2017-11-06T23:26:58Z", "modified": "2017-11-06T23:26:58Z", "text": "The Memcache service is still recovering from the outage. The rate of errors continues to decrease and we expect a full resolution of this incident in the near future.\r\n\r\nWe will provide an update by 16:00 US/Pacific with current details.", "when": "2017-11-06T23:26:58Z"}, {"created": "2017-11-06T23:08:37Z", "modified": "2017-11-06T23:08:37Z", "text": "The issue with Memcache and MVM availability should be resolved for the majority of projects and we expect a full resolution in the near future. \r\n\r\nWe will provide an update by 15:30 US/Pacific with current details.", "when": "2017-11-06T23:08:37Z"}, {"created": "2017-11-06T22:44:39Z", "modified": "2017-11-06T22:44:39Z", "text": "We are experiencing an issue with Memcache availability beginning at November 6, 2017 at 12:30 pm US/Pacific. \r\nAt this time we are gradually ramping up traffic to Memcache and we see that the rate of errors is decreasing. 
\r\nOther services affected by the outage, such as MVM instances, should be normalizing in the near future.\r\n\r\nWe will provide an update by 15:15 US/Pacific with current details.", "when": "2017-11-06T22:44:39Z"}, {"created": "2017-11-06T22:31:01Z", "modified": "2017-11-06T22:31:01Z", "text": "We are experiencing an issue with Memcache availability beginning at November 6, 2017 at 12:30 pm US/Pacific. \r\nOur Engineering Team believes they have identified the root cause of the errors and is working to mitigate. \r\n\r\nWe will provide an update by 15:00 US/Pacific with current details.", "when": "2017-11-06T22:31:01Z"}, {"created": "2017-11-06T21:57:32Z", "modified": "2017-11-06T21:57:32Z", "text": "We are experiencing an issue with Memcache availability beginning at November 6, 2017 at 12:30 pm US/Pacific. \r\nCurrent data indicates that all projects using Memcache are affected by this issue. \r\nFor everyone who is affected, we apologize for any inconvenience you may be experiencing.\r\n\r\nWe will provide an update by 14:30 US/Pacific with current details.", "when": "2017-11-06T21:57:32Z"}, {"created": "2017-11-06T21:31:35Z", "modified": "2017-11-06T21:31:35Z", "text": "We are experiencing an issue with Memcache availability beginning at November 6, 2017 at 12:30 pm US/Pacific. \r\nCurrent data indicate(s) that all projects using Memcache are affected by this issue. \r\nFor everyone who is affected, we apologize for any inconvenience you may be experiencing.\r\n\r\nWe will provide an update by 14:00 US/Pacific with current details.\r\n", "when": "2017-11-06T21:31:35Z"}, {"created": "2017-11-06T21:11:39Z", "modified": "2017-11-06T21:11:39Z", "text": "We are investigating an issue with Google App Engine and Memcache. 
We will provide more information by 13:30 US/Pacific.", "when": "2017-11-06T21:11:39Z"}], "uri": "/incident/appengine/17007"}, {"begin": "2017-10-31T00:00:00Z", "created": "2017-10-31T03:17:02Z", "end": "2017-10-31T03:45:00Z", "external_desc": "We are investigating an issue with Google Cloud SQL. We see failures for Cloud SQL connections from App Engine and connections using the Cloud SQL Proxy. We are also observing elevated failure rates for Cloud SQL admin activities.", "modified": "2017-10-31T04:03:21Z", "most-recent-update": {"created": "2017-10-31T04:01:43Z", "modified": "2017-10-31T04:01:43Z", "text": "The issue with Cloud SQL connectivity affecting connections from App Engine and connections using the Cloud SQL Proxy as well as the issue with Cloud SQL admin activities have been resolved for all affected projects as of 20:45 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-10-31T04:01:43Z"}, "number": 17018, "public": true, "service_key": "cloud-sql", "service_name": "Google Cloud SQL", "severity": "medium", "updates": [{"created": "2017-10-31T04:01:43Z", "modified": "2017-10-31T04:01:43Z", "text": "The issue with Cloud SQL connectivity affecting connections from App Engine and connections using the Cloud SQL Proxy as well as the issue with Cloud SQL admin activities have been resolved for all affected projects as of 20:45 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-10-31T04:01:43Z"}, {"created": "2017-10-31T03:35:03Z", "modified": "2017-10-31T03:35:03Z", "text": "We are continuing to experience an issue with Cloud SQL connectivity, affecting only connections from App Engine and connections using the Cloud SQL Proxy, beginning at 2017-10-30 17:00 US/Pacific. 
We are also observing elevated failure rates for Cloud SQL admin activities (using the Cloud SQL portion of the Cloud Console UI, using gcloud beta sql, directly using the Admin API, etc.). Our Engineering Team believes they have identified the root cause and mitigation effort is currently underway. For everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide another update by 2017-10-30 21:00 US/Pacific with current details.", "when": "2017-10-31T03:35:03Z"}], "uri": "/incident/cloud-sql/17018"}, {"begin": "2017-10-18T08:37:00Z", "created": "2017-10-18T08:37:38Z", "end": "2017-10-18T09:59:00Z", "external_desc": "Jobs not terminating", "modified": "2017-10-18T10:01:50Z", "most-recent-update": {"created": "2017-10-18T10:01:06Z", "modified": "2017-10-18T10:01:06Z", "text": "The issue with Cloud Dataflow in which batch jobs are stuck and cannot be terminated has been resolved for all affected projects as of Wednesday, 2017-10-18 02:58 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-10-18T10:01:06Z"}, "number": 17002, "public": true, "service_key": "cloud-dataflow", "service_name": "Google Cloud Dataflow", "severity": "medium", "updates": [{"created": "2017-10-18T10:01:06Z", "modified": "2017-10-18T10:01:06Z", "text": "The issue with Cloud Dataflow in which batch jobs are stuck and cannot be terminated has been resolved for all affected projects as of Wednesday, 2017-10-18 02:58 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-10-18T10:01:06Z"}, {"created": "2017-10-18T09:27:50Z", "modified": "2017-10-18T09:37:35Z", "text": "A fix for the issue with Cloud Dataflow in which batch jobs are stuck and cannot be terminated is currently getting rolled out. 
We expect a full resolution in the near future. \r\nWe will provide another status update by Wednesday, 2017-10-18 03:45 US/Pacific with current details. ", "when": "2017-10-18T09:27:50Z"}, {"created": "2017-10-18T08:45:45Z", "modified": "2017-10-18T08:45:45Z", "text": "We are experiencing an issue with Cloud Dataflow in which batch jobs are stuck and cannot be terminated. For everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide an update by Wednesday, 2017-10-18 02:45 US/Pacific with current details.", "when": "2017-10-18T08:45:45Z"}, {"created": "2017-10-18T08:37:38Z", "modified": "2017-10-18T08:37:38Z", "text": "We are investigating an issue with Cloud Dataflow. We will provide more information by 02:05 US/Pacific.", "when": "2017-10-18T08:37:38Z"}], "uri": "/incident/cloud-dataflow/17002"}, {"begin": "2017-10-16T19:44:00Z", "created": "2017-10-16T19:59:34Z", "end": "2017-10-16T20:08:00Z", "external_desc": "Stackdriver Uptime Check Alerts Not Firing", "modified": "2017-10-16T20:13:38Z", "most-recent-update": {"created": "2017-10-16T20:12:33Z", "modified": "2017-10-16T20:12:33Z", "text": "The issue with Stackdriver Uptime Check Alerts not firing has been resolved for all affected projects as of Monday, 2017-10-16 13:08 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-10-16T20:12:33Z"}, "number": 17007, "public": true, "service_key": "google-stackdriver", "service_name": "Google Stackdriver", "severity": "medium", "updates": [{"created": "2017-10-16T20:12:33Z", "modified": "2017-10-16T20:12:33Z", "text": "The issue with Stackdriver Uptime Check Alerts not firing has been resolved for all affected projects as of Monday, 2017-10-16 13:08 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-10-16T20:12:33Z"}, {"created": "2017-10-16T19:59:34Z", "modified": "2017-10-16T19:59:34Z", "text": "We are investigating an issue with Stackdriver Uptime Check Alerts.\r\nWe will provide more information by 13:15 US/Pacific.", "when": "2017-10-16T19:59:34Z"}], "uri": "/incident/google-stackdriver/17007"}, {"begin": "2017-10-12T19:47:00Z", "created": "2017-10-13T16:31:54Z", "end": "2017-10-13T17:12:00Z", "external_desc": "Elevated GCS Errors from Canada", "modified": "2017-10-19T19:49:30Z", "most-recent-update": {"created": "2017-10-19T19:49:30Z", "modified": "2017-10-19T19:49:30Z", "text": "ISSUE SUMMARY\r\n\r\nStarting Thursday 12 October 2017, Google Cloud Storage clients located in the Northeast of North America experienced up to a 10% error rate for a duration of 21 hours and 35 minutes when fetching objects stored in multi-regional buckets in the US.\r\n\r\nWe apologize for the impact of this incident on your application or service. The reliability of our service is a top priority and we understand that we need to do better to ensure that incidents of this type do not recur.\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nBetween Thursday 12 October 2017 12:47 PDT and Friday 13 October 2017 10:12 PDT, Google Cloud Storage clients located in the Northeast of North America experienced up to a 10% rate of 503 errors and elevated latency. Some users experienced higher error rates for brief periods. This incident only impacted requests to fetch objects stored in multi-regional buckets in the US; clients were able to mitigate impact by retrying. The percentage of total global requests to Cloud Storage that experienced errors was 0.03%.\r\n\r\nROOT CAUSE\r\n\r\nGoogle ensures balanced use of its internal networks by throttling outbound traffic at the source host in the event of congestion. 
This incident was caused by a bug in an earlier version of the job that reads Cloud Storage objects from disk and streams data to clients. Under high traffic conditions, the bug caused these jobs to incorrectly throttle outbound network traffic even though the network was not congested.\r\n\r\nGoogle had previously identified this bug and was in the process of rolling out a fix to all Google datacenters. At the time of the incident, Cloud Storage jobs in a datacenter in Northeast North America that serves requests to some Canadian and US clients had not yet received the fix. This datacenter is not a location for customer buckets (https://cloud.google.com/storage/docs/bucket-locations), but objects in multi-regional buckets can be served from instances running in this datacenter in order to optimize latency for clients.\r\n\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nThe incident was first reported by a customer to Google on Thursday 12 October 14:59 PDT. Google engineers determined root cause on Friday 13 October 09:47 PDT. We redirected Cloud Storage traffic away from the impacted region at 10:08 and the incident was resolved at 10:12. \r\n\r\nWe have now rolled out the bug fix to all regions. We will also add external monitoring probes for all regional points of presence so that we can more quickly detect issues of this type.\r\n", "when": "2017-10-19T19:49:30Z"}, "number": 17005, "public": true, "service_key": "storage", "service_name": "Google Cloud Storage", "severity": "high", "updates": [{"created": "2017-10-19T19:49:30Z", "modified": "2017-10-19T19:49:30Z", "text": "ISSUE SUMMARY\r\n\r\nStarting Thursday 12 October 2017, Google Cloud Storage clients located in the Northeast of North America experienced up to a 10% error rate for a duration of 21 hours and 35 minutes when fetching objects stored in multi-regional buckets in the US.\r\n\r\nWe apologize for the impact of this incident on your application or service. 
The reliability of our service is a top priority and we understand that we need to do better to ensure that incidents of this type do not recur.\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nBetween Thursday 12 October 2017 12:47 PDT and Friday 13 October 2017 10:12 PDT, Google Cloud Storage clients located in the Northeast of North America experienced up to a 10% rate of 503 errors and elevated latency. Some users experienced higher error rates for brief periods. This incident only impacted requests to fetch objects stored in multi-regional buckets in the US; clients were able to mitigate impact by retrying. The percentage of total global requests to Cloud Storage that experienced errors was 0.03%.\r\n\r\nROOT CAUSE\r\n\r\nGoogle ensures balanced use of its internal networks by throttling outbound traffic at the source host in the event of congestion. This incident was caused by a bug in an earlier version of the job that reads Cloud Storage objects from disk and streams data to clients. Under high traffic conditions, the bug caused these jobs to incorrectly throttle outbound network traffic even though the network was not congested.\r\n\r\nGoogle had previously identified this bug and was in the process of rolling out a fix to all Google datacenters. At the time of the incident, Cloud Storage jobs in a datacenter in Northeast North America that serves requests to some Canadian and US clients had not yet received the fix. This datacenter is not a location for customer buckets (https://cloud.google.com/storage/docs/bucket-locations), but objects in multi-regional buckets can be served from instances running in this datacenter in order to optimize latency for clients.\r\n\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nThe incident was first reported by a customer to Google on Thursday 12 October 14:59 PDT. Google engineers determined root cause on Friday 13 October 09:47 PDT. 
We redirected Cloud Storage traffic away from the impacted region at 10:08 and the incident was resolved at 10:12. \r\n\r\nWe have now rolled out the bug fix to all regions. We will also add external monitoring probes for all regional points of presence so that we can more quickly detect issues of this type.\r\n", "when": "2017-10-19T19:49:30Z"}, {"created": "2017-10-13T17:17:49Z", "modified": "2017-10-13T17:17:49Z", "text": "The issue with Google Cloud Storage request failures for users in Canada and Northeast North America has been resolved for all affected users as of Friday, 2017-10-13 10:08  US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. We will provide a more detailed analysis of this incident once we have completed our internal investigation.\r\n", "when": "2017-10-13T17:08:00Z"}, {"created": "2017-10-13T16:31:54Z", "modified": "2017-10-13T16:31:54Z", "text": "We are investigating an issue with Google Cloud Storage users in Canada and Northeast North America experiencing HTTP 503 failures. We will provide more information by 10:30 US/Pacific.", "when": "2017-10-13T16:31:54Z"}], "uri": "/incident/storage/17005"}, {"begin": "2017-10-09T21:05:00Z", "created": "2017-10-09T21:57:40Z", "end": "2017-10-09T22:07:00Z", "external_desc": "Elevated Cloud IAM API errors", "modified": "2017-10-09T22:25:00Z", "most-recent-update": {"created": "2017-10-09T22:23:27Z", "modified": "2017-10-09T22:23:27Z", "text": "The issue with the Cloud IAM API has been resolved for all affected users as of Monday, 2017-10-09 15:07 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-10-09T22:23:27Z"}, "number": 17001, "public": true, "service_key": "cloud-iam", "service_name": "Identity & Security", "severity": "medium", "updates": [{"created": "2017-10-09T22:23:27Z", "modified": "2017-10-09T22:23:27Z", "text": "The issue with the Cloud IAM API has been resolved for all affected users as of Monday, 2017-10-09 15:07 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-10-09T22:23:27Z"}, {"created": "2017-10-09T22:13:50Z", "modified": "2017-10-09T22:13:50Z", "text": "The issue with the Cloud IAM API should be resolved for the majority of users and we expect a full resolution in the near future. We will provide another status update by 15:30 US/Pacific with current details.", "when": "2017-10-09T22:15:00Z"}, {"created": "2017-10-09T21:57:40Z", "modified": "2017-10-09T21:57:40Z", "text": "We are investigating an issue with the Cloud Identity & Access Management API. We will provide more information by 15:15 US/Pacific.", "when": "2017-10-09T21:57:40Z"}], "uri": "/incident/cloud-iam/17001"}, {"begin": "2017-10-06T20:19:00Z", "created": "2017-10-06T21:21:58Z", "end": "2017-10-06T21:31:00Z", "external_desc": "Elevated GCS errors in us-east1", "modified": "2017-10-06T21:59:53Z", "most-recent-update": {"created": "2017-10-06T21:59:13Z", "modified": "2017-10-06T21:59:13Z", "text": "The issue with GCS service has been resolved for all affected users as of 14:31 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our system to help prevent or minimize future recurrence.", "when": "2017-10-06T22:00:00Z"}, "number": 17004, "public": true, "service_key": "storage", "service_name": "Google Cloud Storage", "severity": "medium", "updates": [{"created": "2017-10-06T21:59:13Z", "modified": "2017-10-06T21:59:13Z", "text": "The issue with GCS service has been resolved for all affected users as of 14:31 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our system to help prevent or minimize future recurrence.", "when": "2017-10-06T22:00:00Z"}, {"created": "2017-10-06T21:36:19Z", "modified": "2017-10-06T21:36:19Z", "text": "The issue with GCS service should be resolved for the majority of users and we expect a full resolution in the near future. We will provide another status update by 15:00 US/Pacific with current details.", "when": "2017-10-06T21:36:00Z"}, {"created": "2017-10-06T21:21:58Z", "modified": "2017-10-06T21:21:58Z", "text": "We are investigating an issue that occurred with GCS starting at 13:19 PDT. We will provide more information by 14:30 US/Pacific.", "when": "2017-10-06T21:21:58Z"}], "uri": "/incident/storage/17004"}, {"begin": "2017-10-03T21:27:00Z", "created": "2017-10-03T21:58:51Z", "end": "2017-10-03T23:48:00Z", "external_desc": "Stackdriver console unavailable", "modified": "2017-10-03T23:50:06Z", "most-recent-update": {"created": "2017-10-03T23:48:16Z", "modified": "2017-10-03T23:48:16Z", "text": "The issue with Google Stackdriver has been resolved for all affected users as of 2017-10-03 16:28 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-10-03T23:48:16Z"}, "number": 17006, "public": true, "service_key": "google-stackdriver", "service_name": "Google Stackdriver", "severity": "high", "updates": [{"created": "2017-10-03T23:48:16Z", "modified": "2017-10-03T23:48:16Z", "text": "The issue with Google Stackdriver has been resolved for all affected users as of 2017-10-03 16:28 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-10-03T23:48:16Z"}, {"created": "2017-10-03T23:08:25Z", "modified": "2017-10-03T23:08:25Z", "text": "We are continuing to investigate the Google Stackdriver issue. Graphs are fully restored, but alerting policies and uptime checks are still degraded. We will provide another update at 17:00 US/Pacific.", "when": "2017-10-03T23:08:25Z"}, {"created": "2017-10-03T22:31:04Z", "modified": "2017-10-03T22:31:04Z", "text": "We are continuing to investigate the Google Stackdriver issue. In addition to graph and alerting policy unavailability, uptime checks are not completing successfully. We believe we have isolated the root cause and are working on a resolution, and we will provide another update at 16:00 US/Pacific.", "when": "2017-10-03T22:31:04Z"}, {"created": "2017-10-03T21:58:51Z", "modified": "2017-10-03T21:58:51Z", "text": "We are investigating an issue with Google Stackdriver that is causing charts and alerting policies to be unavailable.  
We will provide more information by 15:30 US/Pacific.", "when": "2017-10-03T21:58:51Z"}], "uri": "/incident/google-stackdriver/17006"}, {"begin": "2017-10-03T18:41:00Z", "created": "2017-10-03T18:41:29Z", "end": "2017-10-04T05:30:00Z", "external_desc": "Project creation failure", "modified": "2017-10-04T05:46:19Z", "most-recent-update": {"created": "2017-10-04T05:45:57Z", "modified": "2017-10-04T05:45:57Z", "text": "The issue with Project Creation failing with \"Unknown error\" has been resolved for all affected users as of Tuesday, 2017-10-03 22:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.\r\n", "when": "2017-10-04T05:45:57Z"}, "number": 17007, "public": true, "service_key": "developers-console", "service_name": "Google Cloud Console", "severity": "medium", "updates": [{"created": "2017-10-04T05:45:57Z", "modified": "2017-10-04T05:45:57Z", "text": "The issue with Project Creation failing with \"Unknown error\" has been resolved for all affected users as of Tuesday, 2017-10-03 22:30 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.\r\n", "when": "2017-10-04T05:45:57Z"}, {"created": "2017-10-03T23:24:44Z", "modified": "2017-10-03T23:24:44Z", "text": "The investigation and mitigation work is currently underway by our Engineering Team. We will provide another status update by Wednesday, 2017-10-04 9:30AM US/Pacific with current details.", "when": "2017-10-03T23:25:00Z"}, {"created": "2017-10-03T19:32:14Z", "modified": "2017-10-03T19:32:14Z", "text": "The project creation is experiencing a 100% error rate on requests. 
We will provide another status update by Tuesday, 2017-10-03 16:30 US/Pacific with current details.", "when": "2017-10-03T19:35:00Z"}, {"created": "2017-10-03T18:41:29Z", "modified": "2017-10-03T18:41:29Z", "text": "We are investigating an issue with Project creation. We will provide more information by 12:40PM US/Pacific.", "when": "2017-10-03T18:41:29Z"}], "uri": "/incident/developers-console/17007"}, {"begin": "2017-09-26T18:59:00Z", "created": "2017-09-29T20:37:25Z", "end": "2017-09-29T22:35:00Z", "external_desc": "Errors creating new Stackdriver accounts and adding new projects to existing Stackdriver accounts.", "modified": "2017-09-29T22:49:14Z", "most-recent-update": {"created": "2017-09-29T22:47:25Z", "modified": "2017-09-29T22:47:25Z", "text": "The issue with Google Stackdriver has been resolved for all affected projects as of Friday, 2017-09-29 15:35 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-09-29T22:47:25Z"}, "number": 17005, "public": true, "service_key": "google-stackdriver", "service_name": "Google Stackdriver", "severity": "medium", "updates": [{"created": "2017-09-29T22:47:25Z", "modified": "2017-09-29T22:47:25Z", "text": "The issue with Google Stackdriver has been resolved for all affected projects as of Friday, 2017-09-29 15:35 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-09-29T22:47:25Z"}, {"created": "2017-09-29T21:59:16Z", "modified": "2017-09-29T21:59:16Z", "text": "The Stackdriver issue is believed to be affecting less than 20% of project creation and update requests. Our Engineering Team is still working on isolating the problem. 
We will provide more information by 16:30 US/Pacific.", "when": "2017-09-29T21:59:16Z"}, {"created": "2017-09-29T20:37:26Z", "modified": "2017-09-29T20:37:26Z", "text": "We are investigating an issue with Google Stackdriver that affects creating new Stackdriver accounts and adding new projects to existing Stackdriver accounts. We will provide more information by 14:30 US/Pacific.", "when": "2017-09-29T20:37:26Z"}], "uri": "/incident/google-stackdriver/17005"}, {"begin": "2017-09-21T19:39:00Z", "created": "2017-09-21T19:39:46Z", "end": "2017-09-21T21:25:00Z", "external_desc": "Activity Stream not showing new Activity Logs", "modified": "2017-09-21T21:56:46Z", "most-recent-update": {"created": "2017-09-21T19:39:46Z", "modified": "2017-09-21T19:39:46Z", "text": "We are currently investigating an issue with the Cloud Console's Activity Stream not showing new Activity Logs.", "when": "2017-09-21T19:39:46Z"}, "number": 17006, "public": true, "service_key": "developers-console", "service_name": "Google Cloud Console", "severity": "low", "updates": [{"created": "2017-09-21T19:39:46Z", "modified": "2017-09-21T19:39:46Z", "text": "We are currently investigating an issue with the Cloud Console's Activity Stream not showing new Activity Logs.", "when": "2017-09-21T19:39:46Z"}], "uri": "/incident/developers-console/17006"}, {"begin": "2017-09-18T19:56:00Z", "created": "2017-09-18T19:56:21Z", "end": "2017-09-18T20:38:00Z", "external_desc": "GCLB Load Balancer Creation/Changes Impaired", "modified": "2017-09-18T20:47:53Z", "most-recent-update": {"created": "2017-09-18T20:45:48Z", "modified": "2017-09-18T20:45:48Z", "text": "The issue affecting load balancer configurations has been resolved as of 12:48 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.\r\n", "when": "2017-09-18T20:45:48Z"}, "number": 17003, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "high", "updates": [{"created": "2017-09-18T20:45:48Z", "modified": "2017-09-18T20:45:48Z", "text": "The issue affecting load balancer configurations has been resolved as of 12:48 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.\r\n", "when": "2017-09-18T20:45:48Z"}, {"created": "2017-09-18T20:24:41Z", "modified": "2017-09-18T20:24:41Z", "text": "There have been several sets of correct configuration updates since a code reversion. There is a potential for some minor issues for a little longer, but the issue has been substantially resolved.", "when": "2017-09-18T20:24:41Z"}, {"created": "2017-09-18T19:56:21Z", "modified": "2017-09-18T19:56:21Z", "text": "We are investigating an issue with load balancer configurations impacting both changes to existing service and creation of services. We will provide more information by 13:30 US/Pacific.\r\n", "when": "2017-09-18T19:56:21Z"}], "uri": "/incident/cloud-networking/17003"}, {"begin": "2017-09-14T13:47:00Z", "created": "2017-09-14T13:48:25Z", "end": "2017-09-14T15:20:00Z", "external_desc": "Google Cloud Pub/Sub partially unavailable.", "modified": "2017-09-14T15:35:27Z", "most-recent-update": {"created": "2017-09-14T15:33:17Z", "modified": "2017-09-14T15:33:17Z", "text": "The issue with Pub/Sub subscription creation has been resolved for all affected projects as of 08:20 US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-09-14T15:33:17Z"}, "number": 17003, "public": true, "service_key": "cloud-pubsub", "service_name": "Google Cloud Pub/Sub", "severity": "medium", "updates": [{"created": "2017-09-14T15:33:17Z", "modified": "2017-09-14T15:33:17Z", "text": "The issue with Pub/Sub subscription creation has been resolved for all affected projects as of 08:20 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-09-14T15:33:17Z"}, {"created": "2017-09-14T14:26:58Z", "modified": "2017-09-14T15:35:26Z", "text": "We are experiencing an issue with Pub/Sub subscription creation beginning at 2017-09-14 06:30 US/Pacific. Current data  indicates that approximately 12% of requests are affected by this issue. For everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide an update by 08:30 US/Pacific with current details.", "when": "2017-09-14T14:26:58Z"}, {"created": "2017-09-14T13:48:25Z", "modified": "2017-09-14T13:48:25Z", "text": "We are investigating an issue with Google Pub/Sub. We will provide more information by 07:15 US/Pacific.", "when": "2017-09-14T13:48:25Z"}], "uri": "/incident/cloud-pubsub/17003"}, {"begin": "2017-08-30T17:00:00Z", "created": "2017-08-30T18:06:00Z", "end": "2017-08-30T19:08:00Z", "external_desc": "We are investigating an issue with BigQuery queries failing starting at 10:15am PT", "modified": "2017-08-31T18:56:20Z", "most-recent-update": {"created": "2017-08-30T19:23:50Z", "modified": "2017-08-31T18:56:19Z", "text": "The issue with BigQuery queries failing has been resolved for all affected users as of 12:05pm US/Pacific. 
We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-08-30T19:15:00Z"}, "number": 18033, "public": true, "service_key": "bigquery", "service_name": "Google BigQuery", "severity": "medium", "updates": [{"created": "2017-08-30T19:23:50Z", "modified": "2017-08-31T18:56:19Z", "text": "The issue with BigQuery queries failing has been resolved for all affected users as of 12:05pm US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence.", "when": "2017-08-30T19:15:00Z"}, {"created": "2017-08-30T18:17:15Z", "modified": "2017-08-30T18:17:15Z", "text": "The BigQuery service is experiencing a 16% error rate on queries. We will provide another status update by 12:00pm US/Pacific with current details.", "when": "2017-08-30T18:17:00Z"}, {"created": "2017-08-30T18:06:00Z", "modified": "2017-08-30T18:06:00Z", "text": "We are investigating an issue with BigQuery queries failing starting at 10:15am PT", "when": "2017-08-30T18:06:00Z"}], "uri": "/incident/bigquery/18033"}, {"begin": "2017-08-29T20:56:00Z", "created": "2017-08-30T07:52:41Z", "end": "2017-08-31T03:18:00Z", "external_desc": "Issue with Cloud Network Load Balancers connectivity", "modified": "2017-09-06T00:16:55Z", "most-recent-update": {"created": "2017-09-01T21:55:42Z", "modified": "2017-09-06T00:16:55Z", "text": "Revised Tuesday 05 September 2017 to clarify the impact and timing.\r\n\r\nISSUE SUMMARY\r\n\r\nFor portions of Tuesday 29 August and Wednesday 30 August 2017, some Google Compute Engine instances which were live migrated from one server to another stopped receiving network traffic from Google Cloud Network Load Balancers and Internal Load balancers. 
On average, less than 1% of GCE instances were affected by this behavior over the duration of the incident, and at its peak, 2% of instances were affected. For the 2% of instances which were ultimately affected, the mean duration of the impact was 9 hours and the maximum duration was 30 hours and 22 minutes. We apologize for the impact this had on your services. We are particularly cognizant of the unusual duration of the incident. We have completed an extensive postmortem to learn from the issue and improve Google Cloud Platform.\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nAny GCE instance that was live-migrated between 13:56 PDT on Tuesday 29 August 2017 and 08:32 on Wednesday 30 August 2017 became unreachable via Google Cloud Network or Internal Load Balancing until between 08:56 and 14:18 (for regions other than us-central1) or 20:16 (for us-central1) on Wednesday. See https://goo.gl/NjqQ31 for a visual representation of the cumulative number of instances live-migrated over time.\r\n\r\nOur internal investigation shows that, at peak, 2% of GCE instances were affected by the issue.\r\n\r\nInstances which were not live-migrated during this period were not affected. In addition, instances that do not use Network Load Balancing or Internal Load Balancing were not affected. Related capabilities such as Google Cloud HTTP(S) Load Balancing, TCP and SSL Proxy Load Balancing and direct connectivity on instance internal and external IP addresses were unaffected.\r\n\r\nROOT CAUSE\r\n\r\nLive-migration transfers a running VM from one host machine to another host machine within the same zone. 
All VM properties and attributes remain unchanged, including internal and external IP addresses, instance metadata, block storage data and volumes, OS and application state, network settings, network connections, and so on.\r\n\r\nIn this case, a change in the internal representation of networking information in VM instances caused inconsistency between two values, both of which were supposed to hold the external and internal virtual IP addresses of load balancers. When an affected instance was live-migrated, the instance was deprogrammed from the load balancer because of the inconsistency.  This made it impossible for load balancers that used the instance as backend to look up the destination IP address of the instance following its migration, so traffic destined for that instance was not forwarded from the load balancer.\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nAt 08:32 Google engineers rolled back the triggering change, at which point no new live-migration would cause the issue.  At 08:56 they then started a process which fixed all mismatched network information; this process completed at 14:18 except for us-central1 which took until 20:18.\r\n\r\nIn order to prevent the issue from recurring, Google engineers are enhancing the automated canary testing that simulates live-migration events, increasing detection of load balancing packets loss, and enforcing more restrictions on new configuration changes deployment for internal representation changes.", "when": "2017-09-06T00:16:55Z"}, "number": 17002, "public": true, "service_key": "cloud-networking", "service_name": "Google Cloud Networking", "severity": "high", "updates": [{"created": "2017-09-01T21:55:42Z", "modified": "2017-09-06T00:16:55Z", "text": "Revised Tuesday 05 September 2017 to clarify the impact and timing.\r\n\r\nISSUE SUMMARY\r\n\r\nFor portions of Tuesday 29 August and Wednesday 30 August 2017, some Google Compute Engine instances which were live migrated from one server to another stopped 
receiving network traffic from Google Cloud Network Load Balancers and Internal Load balancers. On average, less than 1% of GCE instances were affected by this behavior over the duration of the incident, and at its peak, 2% of instances were affected. For the 2% of instances which were ultimately affected, the mean duration of the impact was 9 hours and the maximum duration was 30 hours and 22 minutes. We apologize for the impact this had on your services. We are particularly cognizant of the unusual duration of the incident. We have completed an extensive postmortem to learn from the issue and improve Google Cloud Platform.\r\n\r\nDETAILED DESCRIPTION OF IMPACT\r\n\r\nAny GCE instance that was live-migrated between 13:56 PDT on Tuesday 29 August 2017 and 08:32 on Wednesday 30 August 2017 became unreachable via Google Cloud Network or Internal Load Balancing until between 08:56 and 14:18 (for regions other than us-central1) or 20:16 (for us-central1) on Wednesday. See https://goo.gl/NjqQ31 for a visual representation of the cumulative number of instances live-migrated over time.\r\n\r\nOur internal investigation shows that, at peak, 2% of GCE instances were affected by the issue.\r\n\r\nInstances which were not live-migrated during this period were not affected. In addition, instances that do not use Network Load Balancing or Internal Load Balancing were not affected. Related capabilities such as Google Cloud HTTP(S) Load Balancing, TCP and SSL Proxy Load Balancing and direct connectivity on instance internal and external IP addresses were unaffected.\r\n\r\nROOT CAUSE\r\n\r\nLive-migration transfers a running VM from one host machine to another host machine within the same zone. 
All VM properties and attributes remain unchanged, including internal and external IP addresses, instance metadata, block storage data and volumes, OS and application state, network settings, network connections, and so on.\r\n\r\nIn this case, a change in the internal representation of networking information in VM instances caused inconsistency between two values, both of which were supposed to hold the external and internal virtual IP addresses of load balancers. When an affected instance was live-migrated, the instance was deprogrammed from the load balancer because of the inconsistency.  This made it impossible for load balancers that used the instance as backend to look up the destination IP address of the instance following its migration, so traffic destined for that instance was not forwarded from the load balancer.\r\n\r\nREMEDIATION AND PREVENTION\r\n\r\nAt 08:32 Google engineers rolled back the triggering change, at which point no new live-migration would cause the issue.  At 08:56 they then started a process which fixed all mismatched network information; this process completed at 14:18 except for us-central1 which took until 20:18.\r\n\r\nIn order to prevent the issue from recurring, Google engineers are enhancing the automated canary testing that simulates live-migration events, increasing detection of load balancing packets loss, and enforcing more restrictions on new configuration changes deployment for internal representation changes.", "when": "2017-09-06T00:16:55Z"}, {"created": "2017-08-31T04:03:02Z", "modified": "2017-08-31T04:03:02Z", "text": "The issue with Network Load Balancers has been resolved for all affected projects as of 20:18 US/Pacific. We will conduct an internal investigation of this issue and make appropriate improvements to our systems to help prevent or minimize future recurrence. 
We will provide a more detailed analysis of this incident once we have completed our internal investigation.", "when": "2017-08-31T03:18:00Z"}, {"created": "2017-08-31T02:18:19Z", "modified": "2017-08-31T02:18:19Z", "text": "The issue with Network Load Balancers should be resolved for all regions except for < 10% of affected Network Load Balancers in us-central1. The last few will be resolved in the upcoming hours. We will provide another status update by 21:00 US/Pacific with current details.", "when": "2017-08-31T02:18:19Z"}, {"created": "2017-08-30T23:54:08Z", "modified": "2017-08-30T23:54:08Z", "text": "The issue with Network Load Balancers should be resolved for all regions except for < 10% of affected Network Load Balancers in us-central1. The last few will be resolved in the upcoming hours. We will provide another status update by 19:00 US/Pacific with current details.", "when": "2017-08-30T23:54:08Z"}, {"created": "2017-08-30T22:58:09Z", "modified": "2017-08-30T22:58:09Z", "text": "The issue with Network Load Balancers should be resolved for all regions except us-central1, for which repairs are almost complete. We expect a full resolution in the next hour, and will provide another status update by 17:00 US/Pacific with current details.", "when": "2017-08-30T22:58:09Z"}, {"created": "2017-08-30T21:35:08Z", "modified": "2017-08-30T21:35:08Z", "text": "The issue with Network Load Balancers should be resolved for all regions except us-central1, for which repairs are ongoing. We expect a full resolution in the next few hours, and will provide another status update by 16:00 US/Pacific with current details.\r\n", "when": "2017-08-30T21:35:08Z"}, {"created": "2017-08-30T20:36:19Z", "modified": "2017-08-30T20:36:19Z", "text": "The issue with Network Load Balancers should be resolved for all regions except for us-central1, us-east1, and europe-west1. Those 3 are underway. We expect a full resolution in the next few hours. 
We will provide another status update by 16:00 US/Pacific with current details.\r\n\r\n", "when": "2017-08-30T20:36:19Z"}, {"created": "2017-08-30T19:03:30Z", "modified": "2017-08-30T19:03:30Z", "text": "We have identified all possibly affected instances and are currently testing the fix for these instances. We will be deploying the fix once it has been verified. No additional action is required. Performing the workaround mentioned previously will not cause any adverse effects.\r\n\r\nNext update at 14:00 US/Pacific", "when": "2017-08-30T19:00:00Z"}, {"created": "2017-08-30T18:02:42Z", "modified": "2017-08-30T18:07:41Z", "text": "We wanted to send another update with better formatting. We will provide more another update on resolving effected instances by 12 PDT.\r\n\r\nAffected customers can also mitigate their affected instances with the following procedure (which causes Network Load Balancer to be reprogrammed) using gcloud tool or via the Compute Engine API.\r\n\r\nNB: No modification to the existing load balancer configurations is necessary, but a temporary TargetPool needs to be created.\r\n\r\nCreate a new TargetPool.\r\nAdd the affected VMs in a region to the new TargetPool.\r\nWait for the VMs to start working in their existing load balancer configuration.\r\nDelete the new TargetPool.\r\nDO NOT delete the existing load balancer config, including the old target pool.\r\nIt is not necessary to create a new ForwardingRule.\r\n\r\nExample:\r\n\r\n1) gcloud compute target-pools create dummy-pool --project=&lt;your_project> --region=&lt;region>\r\n\r\n2) gcloud compute target-pools add-instances dummy-pool --instances=&lt;instance1,instance2,...>  --project=&lt;your_project> --region=&lt;region> --instances-zone=&lt;zone>\r\n\r\n3) (Wait)\r\n\r\n4) gcloud compute target-pools delete dummy-pool --project=&lt;your_project> --region=&lt;region>\r\n\r\n", "when": "2017-08-30T18:02:42Z"}, {"created": "2017-08-30T17:30:42Z", "modified": "2017-08-30T17:30:42Z", 
"text": "Our first mitigation has completed at this point and no new instances should be effected. We are slowly going through an fixing affected customers. Affected customers can also mitigate their affected instances with the following procedure (which causes Network Load Balancer to be reprogrammed) using gcloud tool or via the Compute Engine API.\r\n\r\nNB: No modification to the existing load balancer configurations is necessary, but a temporary TargetPool needs to be created.\r\n\r\nCreate a new TargetPool.\r\nAdd the affected VMs in a region to the new TargetPool.\r\nWait for the VMs to start working in their existing load balancer configuration.\r\nDelete the new TargetPool.\r\nDO NOT delete the existing load balancer config, including the old target pool.\r\nIt is not necessary to create a new ForwardingRule.\r\n\r\nExample:\r\ngcloud compute target-pools create dummy-pool --project=<your_project> --region=<region>\r\ngcloud compute target-pools add-instances dummy-pool --instances=<instance1,instance2,...>  --project=<your_project> --region=<region> --instances-zone=<zone>\r\n(Wait)\r\ngcloud compute target-pools delete dummy-pool --project=<your_project> --region=<region>\r\n\r\n", "when": "2017-08-30T17:30:42Z"}, {"created": "2017-08-30T16:30:12Z", "modified": "2017-08-30T16:30:12Z", "text": "We are experiencing an issue with a subset of Network Load Balance. The configuration change to mitigate this issue has been rolled out and we are working on further measures to completely resolve the issue. For everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide an update by 10:30 US/Pacific with current details.", "when": "2017-08-30T16:30:12Z"}, {"created": "2017-08-30T16:05:49Z", "modified": "2017-08-30T16:05:49Z", "text": "We are experiencing an issue with a subset of Network Load Balancer in regions us-east1, us-central1, europe-west1, asia-northeast1 and asia-east1 not being able to connect to backends. 
The configuration change to mitigate this issue has been rolled out and we are working on further measures to completely resolve the issue. For everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide an update by 09:30 US/Pacific with current details.", "when": "2017-08-30T16:00:00Z"}, {"created": "2017-08-30T15:33:09Z", "modified": "2017-08-30T15:34:33Z", "text": "We are experiencing an issue with a subset of Network Load Balancer in regions us-east1, us-central1, europe-west1, asia-northeast1 and asia-east1 not being able to connect to backends. We have identified the event that triggers this issue and are rolling back a configuration change to mitigate this issue. For everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide an update by 09:00 US/Pacific with current details.", "when": "2017-08-30T15:30:00Z"}, {"created": "2017-08-30T15:03:28Z", "modified": "2017-08-30T15:03:28Z", "text": "We are experiencing an issue with a subset of Network Load Balancer in regions us-east1, us-central1, europe-west1, asia-northeast1 and asia-east1 not being able to connect to backends. Mitigation work is still in progress. For everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide an update by 08:30 US/Pacific with current details.", "when": "2017-08-30T15:00:00Z"}, {"created": "2017-08-30T14:30:46Z", "modified": "2017-08-30T14:30:46Z", "text": "We are experiencing an issue with a subset of Network Load Balancer in regions us-east1, us-central1, europe-west1, asia-northeast1 and asia-east1 not being able to connect to backends. Mitigation work is still in progress. For everyone who is affected, we apologize for any inconvenience you may be experiencing. 
We will provide an update by 08:00 US/Pacific with current details.\r\n", "when": "2017-08-30T14:30:00Z"}, {"created": "2017-08-30T14:02:15Z", "modified": "2017-08-30T14:02:15Z", "text": "We are experiencing an issue with a subset of Network Load Balancer in regions us-east1, us-central1, europe-west1, asia-northeast1 and asia-east1 not being able to connect to backends.  Our previous actions did not resolve the issue. We are pursuing alternative solutions. For everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide an update by 07:30 US/Pacific with current details.", "when": "2017-08-30T14:00:00Z"}, {"created": "2017-08-30T13:30:28Z", "modified": "2017-08-30T13:30:28Z", "text": "We are experiencing an issue with a subset of Network Load Balancer in regions us-east1, us-central1, europe-west1, asia-northeast1 and asia-east1 not being able to connect to backends. Mitigation work is currently underway by our Engineering Team. For everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide an update by 07:00 US/Pacific with current details.", "when": "2017-08-30T13:30:00Z"}, {"created": "2017-08-30T12:30:57Z", "modified": "2017-08-30T13:24:29Z", "text": "We are experiencing an issue with a subset of Network Load Balancer in regions us-east1, us-central1,  europe-west1, asia-northeast1 and asia-east1 not being able to connect to backends. Our Engineering Team has determined the infrastructure component responsible for the issue and mitigation work is currently underway. For everyone who is affected, we apologize for any inconvenience you may be experiencing. 
We will provide an update by 06:30 US/Pacific with current details.", "when": "2017-08-30T13:00:00Z"}, {"created": "2017-08-30T13:12:03Z", "modified": "2017-08-30T13:24:08Z", "text": "We are experiencing an issue with a subset of Network Load Balancer in regions us-east1, us-central1,  europe-west1, asia-northeast1 and asia-east1 not being able to connect to backends. Our Engineering Team has reduced the scope of possible root causes and is still investigating. For everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide an update by 06:00 US/Pacific with current details.", "when": "2017-08-30T12:30:00Z"}, {"created": "2017-08-30T12:02:46Z", "modified": "2017-08-30T12:32:25Z", "text": "We are experiencing an intermittent issue with Network Load Balancer connectivity to their backends.\r\nThe investigation is still ongoing. \r\nFor everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide an update by 05:30 US/Pacific with current details.", "when": "2017-08-30T12:00:00Z"}, {"created": "2017-08-30T11:30:51Z", "modified": "2017-08-30T11:30:51Z", "text": "We are experiencing an intermittent issue with Network Load Balancer connectivity to their backends.\r\nThe investigation is still ongoing. \r\nFor everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide an update by 05:00 US/Pacific with current details.", "when": "2017-08-30T11:30:00Z"}, {"created": "2017-08-30T11:02:16Z", "modified": "2017-08-30T11:02:16Z", "text": "We are experiencing an intermittent issue with Network Load Balancer connectivity to their backends.\r\nWe have ruled out several possible failure scenarios. The investigation is still ongoing. \r\nFor everyone who is affected, we apologize for any inconvenience you may be experiencing. 
We will provide an update by 04:30 US/Pacific with current details.", "when": "2017-08-30T11:00:00Z"}, {"created": "2017-08-30T10:30:03Z", "modified": "2017-08-30T10:30:03Z", "text": "We are experiencing an intermittent issue with Network Load Balancer connectivity to their backends. For everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide an update by 04:00 US/Pacific with current details.", "when": "2017-08-30T10:30:03Z"}, {"created": "2017-08-30T10:00:10Z", "modified": "2017-08-30T10:00:10Z", "text": "We are experiencing an intermittent issue with Network Load Balancer connectivity to their backends. For everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide an update by 03:30 US/Pacific with current details.", "when": "2017-08-30T10:00:10Z"}, {"created": "2017-08-30T09:29:08Z", "modified": "2017-08-30T09:29:08Z", "text": "We are experiencing an intermittent issue with Network Load Balancer connectivity to their backends. For everyone who is affected, we apologize for any inconvenience you may be experiencing. We will provide an update by 03:00 US/Pacific with current details.\r\n", "when": "2017-08-30T09:30:00Z"}, {"created": "2017-08-30T08:50:55Z", "modified": "2017-08-30T08:50:55Z", "text": "We are investigating an issue with network load balancer connectivity. We will provide more information by 02:30 US/Pacific.", "when": "2017-08-30T08:50:55Z"}, {"created": "2017-08-30T08:20:07Z", "modified": "2017-08-30T08:20:07Z", "text": "We are investigating an issue with network connectivity. We will provide more information by 01:50 \r\n US/Pacific.\r\n", "when": "2017-08-30T08:20:07Z"}, {"created": "2017-08-30T07:52:41Z", "modified": "2017-08-30T07:52:41Z", "text": "We are investigating an issue with network connectivity. We will provide more information by 01:20 US/Pacific.", "when": "2017-08-30T07:52:41Z"}], "uri": "/incident/cloud-networking/17002"}]