Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
581 lines
18 KiB
581 lines
18 KiB
// Copyright 2015 Matthew Holt |
|
// |
|
// Licensed under the Apache License, Version 2.0 (the "License"); |
|
// you may not use this file except in compliance with the License. |
|
// You may obtain a copy of the License at |
|
// |
|
// http://www.apache.org/licenses/LICENSE-2.0 |
|
// |
|
// Unless required by applicable law or agreed to in writing, software |
|
// distributed under the License is distributed on an "AS IS" BASIS, |
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
// See the License for the specific language governing permissions and |
|
// limitations under the License. |
|
|
|
package certmagic |
|
|
|
import ( |
|
"context" |
|
"crypto/x509" |
|
"encoding/pem" |
|
"fmt" |
|
"log" |
|
"path" |
|
"runtime" |
|
"strings" |
|
"time" |
|
|
|
"go.uber.org/zap" |
|
"golang.org/x/crypto/ocsp" |
|
) |
|
|
|
// maintainAssets is a permanently-blocking function |
|
// that loops indefinitely and, on a regular schedule, checks |
|
// certificates for expiration and initiates a renewal of certs |
|
// that are expiring soon. It also updates OCSP stapling. It |
|
// should only be called once per cache. Panics are recovered, |
|
// and if panicCount < 10, the function is called recursively, |
|
// incrementing panicCount each time. Initial invocation should |
|
// start panicCount at 0. |
|
func (certCache *Cache) maintainAssets(panicCount int) { |
|
log := loggerNamed(certCache.logger, "maintenance") |
|
if log != nil { |
|
log = log.With(zap.String("cache", fmt.Sprintf("%p", certCache))) |
|
} |
|
|
|
defer func() { |
|
if err := recover(); err != nil { |
|
buf := make([]byte, stackTraceBufferSize) |
|
buf = buf[:runtime.Stack(buf, false)] |
|
if log != nil { |
|
log.Error("panic", zap.Any("error", err), zap.ByteString("stack", buf)) |
|
} |
|
if panicCount < 10 { |
|
certCache.maintainAssets(panicCount + 1) |
|
} |
|
} |
|
}() |
|
|
|
renewalTicker := time.NewTicker(certCache.options.RenewCheckInterval) |
|
ocspTicker := time.NewTicker(certCache.options.OCSPCheckInterval) |
|
|
|
if log != nil { |
|
log.Info("started background certificate maintenance") |
|
} |
|
|
|
ctx, cancel := context.WithCancel(context.Background()) |
|
defer cancel() |
|
|
|
for { |
|
select { |
|
case <-renewalTicker.C: |
|
err := certCache.RenewManagedCertificates(ctx) |
|
if err != nil && log != nil { |
|
log.Error("renewing managed certificates", zap.Error(err)) |
|
} |
|
case <-ocspTicker.C: |
|
certCache.updateOCSPStaples(ctx) |
|
case <-certCache.stopChan: |
|
renewalTicker.Stop() |
|
ocspTicker.Stop() |
|
// TODO: stop any in-progress maintenance operations and clear locks we made (this might be done now with our use of context) |
|
if log != nil { |
|
log.Info("stopped background certificate maintenance") |
|
} |
|
close(certCache.doneChan) |
|
return |
|
} |
|
} |
|
} |
|
|
|
// RenewManagedCertificates renews managed certificates, |
|
// including ones loaded on-demand. Note that this is done |
|
// automatically on a regular basis; normally you will not |
|
// need to call this. This method assumes non-interactive |
|
// mode (i.e. operating in the background). |
|
func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error { |
|
log := loggerNamed(certCache.logger, "maintenance") |
|
|
|
// configs will hold a map of certificate name to the config |
|
// to use when managing that certificate |
|
configs := make(map[string]*Config) |
|
|
|
// we use the queues for a very important reason: to do any and all |
|
// operations that could require an exclusive write lock outside |
|
// of the read lock! otherwise we get a deadlock, yikes. in other |
|
// words, our first iteration through the certificate cache does NOT |
|
// perform any operations--only queues them--so that more fine-grained |
|
// write locks may be obtained during the actual operations. |
|
var renewQueue, reloadQueue, deleteQueue []Certificate |
|
|
|
certCache.mu.RLock() |
|
for certKey, cert := range certCache.cache { |
|
if !cert.managed { |
|
continue |
|
} |
|
|
|
// the list of names on this cert should never be empty... programmer error? |
|
if cert.Names == nil || len(cert.Names) == 0 { |
|
if log != nil { |
|
log.Warn("certificate has no names; removing from cache", zap.String("cert_key", certKey)) |
|
} |
|
deleteQueue = append(deleteQueue, cert) |
|
continue |
|
} |
|
|
|
// get the config associated with this certificate |
|
cfg, err := certCache.getConfig(cert) |
|
if err != nil { |
|
if log != nil { |
|
log.Error("unable to get configuration to manage certificate; unable to renew", |
|
zap.Strings("identifiers", cert.Names), |
|
zap.Error(err)) |
|
} |
|
continue |
|
} |
|
if cfg == nil { |
|
// this is bad if this happens, probably a programmer error (oops) |
|
if log != nil { |
|
log.Error("no configuration associated with certificate; unable to manage", |
|
zap.Strings("identifiers", cert.Names)) |
|
} |
|
continue |
|
} |
|
|
|
// if time is up or expires soon, we need to try to renew it |
|
if cert.NeedsRenewal(cfg) { |
|
configs[cert.Names[0]] = cfg |
|
|
|
// see if the certificate in storage has already been renewed, possibly by another |
|
// instance that didn't coordinate with this one; if so, just load it (this |
|
// might happen if another instance already renewed it - kinda sloppy but checking disk |
|
// first is a simple way to possibly drastically reduce rate limit problems) |
|
storedCertExpiring, err := cfg.managedCertInStorageExpiresSoon(cert) |
|
if err != nil { |
|
// hmm, weird, but not a big deal, maybe it was deleted or something |
|
if log != nil { |
|
log.Warn("error while checking if stored certificate is also expiring soon", |
|
zap.Strings("identifiers", cert.Names), |
|
zap.Error(err)) |
|
} |
|
} else if !storedCertExpiring { |
|
// if the certificate is NOT expiring soon and there was no error, then we |
|
// are good to just reload the certificate from storage instead of repeating |
|
// a likely-unnecessary renewal procedure |
|
reloadQueue = append(reloadQueue, cert) |
|
continue |
|
} |
|
|
|
// the certificate in storage has not been renewed yet, so we will do it |
|
// NOTE: It is super-important to note that the TLS-ALPN challenge requires |
|
// a write lock on the cache in order to complete its challenge, so it is extra |
|
// vital that this renew operation does not happen inside our read lock! |
|
renewQueue = append(renewQueue, cert) |
|
} |
|
} |
|
certCache.mu.RUnlock() |
|
|
|
// Reload certificates that merely need to be updated in memory |
|
for _, oldCert := range reloadQueue { |
|
timeLeft := oldCert.Leaf.NotAfter.Sub(time.Now().UTC()) |
|
if log != nil { |
|
log.Info("certificate expires soon, but is already renewed in storage; reloading stored certificate", |
|
zap.Strings("identifiers", oldCert.Names), |
|
zap.Duration("remaining", timeLeft)) |
|
} |
|
|
|
cfg := configs[oldCert.Names[0]] |
|
|
|
// crucially, this happens OUTSIDE a lock on the certCache |
|
err := cfg.reloadManagedCertificate(oldCert) |
|
if err != nil { |
|
if log != nil { |
|
log.Error("loading renewed certificate", |
|
zap.Strings("identifiers", oldCert.Names), |
|
zap.Error(err)) |
|
} |
|
continue |
|
} |
|
} |
|
|
|
// Renewal queue |
|
for _, oldCert := range renewQueue { |
|
cfg := configs[oldCert.Names[0]] |
|
err := certCache.queueRenewalTask(ctx, oldCert, cfg) |
|
if err != nil { |
|
if log != nil { |
|
log.Error("queueing renewal task", |
|
zap.Strings("identifiers", oldCert.Names), |
|
zap.Error(err)) |
|
} |
|
continue |
|
} |
|
} |
|
|
|
// Deletion queue |
|
certCache.mu.Lock() |
|
for _, cert := range deleteQueue { |
|
certCache.removeCertificate(cert) |
|
} |
|
certCache.mu.Unlock() |
|
|
|
return nil |
|
} |
|
|
|
func (certCache *Cache) queueRenewalTask(ctx context.Context, oldCert Certificate, cfg *Config) error { |
|
log := loggerNamed(certCache.logger, "maintenance") |
|
|
|
timeLeft := oldCert.Leaf.NotAfter.Sub(time.Now().UTC()) |
|
if log != nil { |
|
log.Info("certificate expires soon; queuing for renewal", |
|
zap.Strings("identifiers", oldCert.Names), |
|
zap.Duration("remaining", timeLeft)) |
|
} |
|
|
|
// Get the name which we should use to renew this certificate; |
|
// we only support managing certificates with one name per cert, |
|
// so this should be easy. |
|
renewName := oldCert.Names[0] |
|
|
|
// queue up this renewal job (is a no-op if already active or queued) |
|
jm.Submit(cfg.Logger, "renew_"+renewName, func() error { |
|
timeLeft := oldCert.Leaf.NotAfter.Sub(time.Now().UTC()) |
|
if log != nil { |
|
log.Info("attempting certificate renewal", |
|
zap.Strings("identifiers", oldCert.Names), |
|
zap.Duration("remaining", timeLeft)) |
|
} |
|
|
|
// perform renewal - crucially, this happens OUTSIDE a lock on certCache |
|
err := cfg.RenewCert(ctx, renewName, false) |
|
if err != nil { |
|
if cfg.OnDemand != nil { |
|
// loaded dynamically, remove dynamically |
|
certCache.mu.Lock() |
|
certCache.removeCertificate(oldCert) |
|
certCache.mu.Unlock() |
|
} |
|
return fmt.Errorf("%v %v", oldCert.Names, err) |
|
} |
|
|
|
// successful renewal, so update in-memory cache by loading |
|
// renewed certificate so it will be used with handshakes |
|
err = cfg.reloadManagedCertificate(oldCert) |
|
if err != nil { |
|
return ErrNoRetry{fmt.Errorf("%v %v", oldCert.Names, err)} |
|
} |
|
return nil |
|
}) |
|
|
|
return nil |
|
} |
|
|
|
// updateOCSPStaples updates the OCSP stapling in all |
|
// eligible, cached certificates. |
|
// |
|
// OCSP maintenance strives to abide the relevant points on |
|
// Ryan Sleevi's recommendations for good OCSP support: |
|
// https://gist.github.com/sleevi/5efe9ef98961ecfb4da8 |
|
func (certCache *Cache) updateOCSPStaples(ctx context.Context) { |
|
log := loggerNamed(certCache.logger, "maintenance") |
|
|
|
// temporary structures to store updates or tasks |
|
// so that we can keep our locks short-lived |
|
type ocspUpdate struct { |
|
rawBytes []byte |
|
parsed *ocsp.Response |
|
} |
|
type updateQueueEntry struct { |
|
cert Certificate |
|
certHash string |
|
lastNextUpdate time.Time |
|
} |
|
updated := make(map[string]ocspUpdate) |
|
var updateQueue []updateQueueEntry |
|
var renewQueue []Certificate |
|
configs := make(map[string]*Config) |
|
|
|
// obtain brief read lock during our scan to see which staples need updating |
|
certCache.mu.RLock() |
|
for certHash, cert := range certCache.cache { |
|
// no point in updating OCSP for expired or "synthetic" certificates |
|
if cert.Leaf == nil || cert.Expired() { |
|
continue |
|
} |
|
var lastNextUpdate time.Time |
|
if cert.ocsp != nil { |
|
lastNextUpdate = cert.ocsp.NextUpdate |
|
if freshOCSP(cert.ocsp) { |
|
continue // no need to update staple if ours is still fresh |
|
} |
|
} |
|
updateQueue = append(updateQueue, updateQueueEntry{cert, certHash, lastNextUpdate}) |
|
} |
|
certCache.mu.RUnlock() |
|
|
|
// perform updates outside of any lock on certCache |
|
for _, qe := range updateQueue { |
|
cert := qe.cert |
|
certHash := qe.certHash |
|
lastNextUpdate := qe.lastNextUpdate |
|
|
|
cfg, err := certCache.getConfig(cert) |
|
if err != nil { |
|
if log != nil { |
|
log.Error("unable to refresh OCSP staple because getting automation config for certificate failed", |
|
zap.Strings("identifiers", cert.Names), |
|
zap.Error(err)) |
|
} |
|
continue |
|
} |
|
if cfg == nil { |
|
// this is bad if this happens, probably a programmer error (oops) |
|
if log != nil { |
|
log.Error("no configuration associated with certificate; unable to manage OCSP staples", |
|
zap.Strings("identifiers", cert.Names)) |
|
} |
|
continue |
|
} |
|
|
|
ocspResp, err := stapleOCSP(cfg.Storage, &cert, nil) |
|
if err != nil { |
|
if cert.ocsp != nil { |
|
// if there was no staple before, that's fine; otherwise we should log the error |
|
if log != nil { |
|
log.Error("stapling OCSP", |
|
zap.Strings("identifiers", cert.Names), |
|
zap.Error(err)) |
|
} |
|
} |
|
continue |
|
} |
|
|
|
// By this point, we've obtained the latest OCSP response. |
|
// If there was no staple before, or if the response is updated, make |
|
// sure we apply the update to all names on the certificate. |
|
if cert.ocsp != nil && (lastNextUpdate.IsZero() || lastNextUpdate != cert.ocsp.NextUpdate) { |
|
if log != nil { |
|
log.Info("advancing OCSP staple", |
|
zap.Strings("identifiers", cert.Names), |
|
zap.Time("from", lastNextUpdate), |
|
zap.Time("to", cert.ocsp.NextUpdate)) |
|
} |
|
updated[certHash] = ocspUpdate{rawBytes: cert.Certificate.OCSPStaple, parsed: cert.ocsp} |
|
} |
|
|
|
// If a managed certificate was revoked, we should attempt |
|
// to replace it with a new one. If that fails, oh well. |
|
if cert.managed && ocspResp.Status == ocsp.Revoked && len(cert.Names) > 0 { |
|
renewQueue = append(renewQueue, cert) |
|
configs[cert.Names[0]] = cfg |
|
} |
|
} |
|
|
|
// These write locks should be brief since we have all the info we need now. |
|
for certKey, update := range updated { |
|
certCache.mu.Lock() |
|
cert := certCache.cache[certKey] |
|
cert.ocsp = update.parsed |
|
cert.Certificate.OCSPStaple = update.rawBytes |
|
certCache.cache[certKey] = cert |
|
certCache.mu.Unlock() |
|
} |
|
|
|
// We attempt to replace any certificates that were revoked. |
|
// Crucially, this happens OUTSIDE a lock on the certCache. |
|
for _, oldCert := range renewQueue { |
|
if log != nil { |
|
log.Warn("OCSP status for managed certificate is REVOKED; attempting to replace with new certificate", |
|
zap.Strings("identifiers", oldCert.Names), |
|
zap.Time("expiration", oldCert.Leaf.NotAfter)) |
|
} |
|
|
|
renewName := oldCert.Names[0] |
|
cfg := configs[renewName] |
|
|
|
// TODO: consider using a new key in this situation, but we don't know if key storage has been compromised... |
|
err := cfg.RenewCert(ctx, renewName, false) |
|
if err != nil { |
|
// probably better to not serve a revoked certificate at all |
|
if log != nil { |
|
log.Error("unable to obtain new to certificate after OCSP status of REVOKED; removing from cache", |
|
zap.Strings("identifiers", oldCert.Names), |
|
zap.Error(err)) |
|
} |
|
certCache.mu.Lock() |
|
certCache.removeCertificate(oldCert) |
|
certCache.mu.Unlock() |
|
continue |
|
} |
|
err = cfg.reloadManagedCertificate(oldCert) |
|
if err != nil { |
|
if log != nil { |
|
log.Error("after obtaining new certificate due to OCSP status of REVOKED", |
|
zap.Strings("identifiers", oldCert.Names), |
|
zap.Error(err)) |
|
} |
|
continue |
|
} |
|
} |
|
} |
|
|
|
// CleanStorageOptions specifies how to clean up a storage unit. |
|
type CleanStorageOptions struct { |
|
OCSPStaples bool |
|
ExpiredCerts bool |
|
ExpiredCertGracePeriod time.Duration |
|
} |
|
|
|
// CleanStorage removes assets which are no longer useful, |
|
// according to opts. |
|
func CleanStorage(ctx context.Context, storage Storage, opts CleanStorageOptions) { |
|
if opts.OCSPStaples { |
|
err := deleteOldOCSPStaples(ctx, storage) |
|
if err != nil { |
|
log.Printf("[ERROR] Deleting old OCSP staples: %v", err) |
|
} |
|
} |
|
if opts.ExpiredCerts { |
|
err := deleteExpiredCerts(ctx, storage, opts.ExpiredCertGracePeriod) |
|
if err != nil { |
|
log.Printf("[ERROR] Deleting expired certificates: %v", err) |
|
} |
|
} |
|
// TODO: delete stale locks? |
|
} |
|
|
|
func deleteOldOCSPStaples(ctx context.Context, storage Storage) error { |
|
ocspKeys, err := storage.List(prefixOCSP, false) |
|
if err != nil { |
|
// maybe just hasn't been created yet; no big deal |
|
return nil |
|
} |
|
for _, key := range ocspKeys { |
|
// if context was cancelled, quit early; otherwise proceed |
|
select { |
|
case <-ctx.Done(): |
|
return ctx.Err() |
|
default: |
|
} |
|
ocspBytes, err := storage.Load(key) |
|
if err != nil { |
|
log.Printf("[ERROR] While deleting old OCSP staples, unable to load staple file: %v", err) |
|
continue |
|
} |
|
resp, err := ocsp.ParseResponse(ocspBytes, nil) |
|
if err != nil { |
|
// contents are invalid; delete it |
|
err = storage.Delete(key) |
|
if err != nil { |
|
log.Printf("[ERROR] Purging corrupt staple file %s: %v", key, err) |
|
} |
|
continue |
|
} |
|
if time.Now().After(resp.NextUpdate) { |
|
// response has expired; delete it |
|
err = storage.Delete(key) |
|
if err != nil { |
|
log.Printf("[ERROR] Purging expired staple file %s: %v", key, err) |
|
} |
|
} |
|
} |
|
return nil |
|
} |
|
|
|
func deleteExpiredCerts(ctx context.Context, storage Storage, gracePeriod time.Duration) error { |
|
issuerKeys, err := storage.List(prefixCerts, false) |
|
if err != nil { |
|
// maybe just hasn't been created yet; no big deal |
|
return nil |
|
} |
|
|
|
for _, issuerKey := range issuerKeys { |
|
siteKeys, err := storage.List(issuerKey, false) |
|
if err != nil { |
|
log.Printf("[ERROR] Listing contents of %s: %v", issuerKey, err) |
|
continue |
|
} |
|
|
|
for _, siteKey := range siteKeys { |
|
// if context was cancelled, quit early; otherwise proceed |
|
select { |
|
case <-ctx.Done(): |
|
return ctx.Err() |
|
default: |
|
} |
|
|
|
siteAssets, err := storage.List(siteKey, false) |
|
if err != nil { |
|
log.Printf("[ERROR] Listing contents of %s: %v", siteKey, err) |
|
continue |
|
} |
|
|
|
for _, assetKey := range siteAssets { |
|
if path.Ext(assetKey) != ".crt" { |
|
continue |
|
} |
|
|
|
certFile, err := storage.Load(assetKey) |
|
if err != nil { |
|
return fmt.Errorf("loading certificate file %s: %v", assetKey, err) |
|
} |
|
block, _ := pem.Decode(certFile) |
|
if block == nil || block.Type != "CERTIFICATE" { |
|
return fmt.Errorf("certificate file %s does not contain PEM-encoded certificate", assetKey) |
|
} |
|
cert, err := x509.ParseCertificate(block.Bytes) |
|
if err != nil { |
|
return fmt.Errorf("certificate file %s is malformed; error parsing PEM: %v", assetKey, err) |
|
} |
|
|
|
if expiredTime := time.Since(cert.NotAfter); expiredTime >= gracePeriod { |
|
log.Printf("[INFO] Certificate %s expired %s ago; cleaning up", assetKey, expiredTime) |
|
baseName := strings.TrimSuffix(assetKey, ".crt") |
|
for _, relatedAsset := range []string{ |
|
assetKey, |
|
baseName + ".key", |
|
baseName + ".json", |
|
} { |
|
log.Printf("[INFO] Deleting %s because resource expired", relatedAsset) |
|
err := storage.Delete(relatedAsset) |
|
if err != nil { |
|
log.Printf("[ERROR] Cleaning up asset related to expired certificate for %s: %s: %v", |
|
baseName, relatedAsset, err) |
|
} |
|
} |
|
} |
|
} |
|
|
|
// update listing; if folder is empty, delete it |
|
siteAssets, err = storage.List(siteKey, false) |
|
if err != nil { |
|
continue |
|
} |
|
if len(siteAssets) == 0 { |
|
log.Printf("[INFO] Deleting %s because key is empty", siteKey) |
|
err := storage.Delete(siteKey) |
|
if err != nil { |
|
return fmt.Errorf("deleting empty site folder %s: %v", siteKey, err) |
|
} |
|
} |
|
} |
|
} |
|
return nil |
|
} |
|
|
|
const ( |
|
// DefaultRenewCheckInterval is how often to check certificates for expiration. |
|
// Scans are very lightweight, so this can be semi-frequent. This default should |
|
// be smaller than <Minimum Cert Lifetime>*DefaultRenewalWindowRatio/3, which |
|
// gives certificates plenty of chance to be renewed on time. |
|
DefaultRenewCheckInterval = 10 * time.Minute |
|
|
|
// DefaultRenewalWindowRatio is how much of a certificate's lifetime becomes the |
|
// renewal window. The renewal window is the span of time at the end of the |
|
// certificate's validity period in which it should be renewed. A default value |
|
// of ~1/3 is pretty safe and recommended for most certificates. |
|
DefaultRenewalWindowRatio = 1.0 / 3.0 |
|
|
|
// DefaultOCSPCheckInterval is how often to check if OCSP stapling needs updating. |
|
DefaultOCSPCheckInterval = 1 * time.Hour |
|
)
|
|
|