mattermost/server/public/plugin/supervisor.go


// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.

package plugin

import (
	"crypto/sha256"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"

	plugin "github.com/hashicorp/go-plugin"
	"github.com/pkg/errors"

	"github.com/mattermost/mattermost/server/public/model"
	"github.com/mattermost/mattermost/server/public/shared/mlog"
)
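
// supervisor manages the lifecycle of a single plugin process: it launches (or
// reattaches to) the plugin via go-plugin, exposes the plugin's hooks to the
// server, and tears the connection down again on Shutdown.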
type supervisor struct {
	lock             sync.RWMutex
	pluginID         string
	appDriver        AppDriver
	client           *plugin.Client
	hooks            Hooks
	implemented      [TotalHooksID]bool
	hooksClient      *hooksRPCClient
	isReattached     bool
	apiServerCleanup func() // Cleanup function for Python plugin API server
}
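
// driverForPlugin wraps an AppDriver so that every connection it opens is
// tagged with the owning plugin's ID.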
type driverForPlugin struct {
	AppDriver
	pluginID string
}
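
// Conn delegates to ConnWithPluginID, supplying the wrapped plugin's ID.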
func (d *driverForPlugin) Conn(isMaster bool) (string, error) {
	return d.AppDriver.ConnWithPluginID(isMaster, d.pluginID)
}
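
// WithExecutableFromManifest configures the supervisor to launch the plugin
// executable declared in the bundle manifest for the current OS/architecture,
// pinning it to a SHA-256 checksum via go-plugin's SecureConfig.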
func WithExecutableFromManifest(pluginInfo *model.BundleInfo) func(*supervisor, *plugin.ClientConfig) error {
	return func(_ *supervisor, clientConfig *plugin.ClientConfig) error {
		executable := pluginInfo.Manifest.GetExecutableForRuntime(runtime.GOOS, runtime.GOARCH)
		if executable == "" {
			return fmt.Errorf("backend executable not found for environment: %s/%s", runtime.GOOS, runtime.GOARCH)
		}

		executable = filepath.Clean(filepath.Join(".", executable))
		if strings.HasPrefix(executable, "..") {
			return fmt.Errorf("invalid backend executable: %s", executable)
		}
		executable = filepath.Join(pluginInfo.Path, executable)

		cmd := exec.Command(executable)

		// This doesn't add more security than before
		// but removes the SecureConfig is nil warning.
		// https://mattermost.atlassian.net/browse/MM-49167
		pluginChecksum, err := getPluginExecutableChecksum(executable)
		if err != nil {
			return errors.Wrapf(err, "unable to generate plugin checksum")
		}

		clientConfig.Cmd = cmd
		clientConfig.SecureConfig = &plugin.SecureConfig{
			Checksum: pluginChecksum,
			Hash:     sha256.New(),
		}

		return nil
	}
}
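
// WithReattachConfig configures the supervisor to reattach to an already
// running plugin process instead of spawning a new one.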
func WithReattachConfig(pluginReattachConfig *model.PluginReattachConfig) func(*supervisor, *plugin.ClientConfig) error {
	return func(sup *supervisor, clientConfig *plugin.ClientConfig) error {
		clientConfig.Reattach = pluginReattachConfig.ToHashicorpPluginReattachmentConfig()
		sup.isReattached = true
		return nil
	}
}
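
// newSupervisor starts a plugin and returns a supervisor for it. Plugins are
// served over net/rpc by default, while plugins detected as Python via their
// manifest are served over gRPC. If any setup step fails, the partially
// started plugin is shut down before the error is returned.
//
// A typical call looks roughly like the following (illustrative only; the
// variable names are placeholders):
//
//	sup, err := newSupervisor(bundleInfo, apiImpl, driver, logger, metrics,
//		WithExecutableFromManifest(bundleInfo))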
func newSupervisor(pluginInfo *model.BundleInfo, apiImpl API, driver AppDriver, parentLogger *mlog.Logger, metrics metricsInterface, opts ...func(*supervisor, *plugin.ClientConfig) error) (retSupervisor *supervisor, retErr error) {
	sup := supervisor{
		pluginID: pluginInfo.Manifest.Id,
	}
	if driver != nil {
		sup.appDriver = &driverForPlugin{AppDriver: driver, pluginID: pluginInfo.Manifest.Id}
	}
	defer func() {
		if retErr != nil {
			sup.Shutdown()
		}
	}()

	wrappedLogger := pluginInfo.WrapLogger(parentLogger)

	hclogAdaptedLogger := &hclogAdapter{
		wrappedLogger: wrappedLogger,
		extrasKey:     "wrapped_extras",
	}

	pluginMap := map[string]plugin.Plugin{
		"hooks": &hooksPlugin{
			log:        wrappedLogger,
			driverImpl: sup.appDriver,
			apiImpl:    &apiTimerLayer{pluginInfo.Manifest.Id, apiImpl, metrics},
		},
	}

	clientConfig := &plugin.ClientConfig{
		HandshakeConfig: handshake,
		Plugins:         pluginMap,
		SyncStdout:      wrappedLogger.With(mlog.String("source", "plugin_stdout")).StdLogWriter(),
		SyncStderr:      wrappedLogger.With(mlog.String("source", "plugin_stderr")).StdLogWriter(),
		Logger:          hclogAdaptedLogger,
		StartTimeout:    time.Second * 3,
	}

	// Check if this is a Python plugin and configure accordingly.
	isPython := isPythonPlugin(pluginInfo.Manifest)
	if isPython {
		// Python plugins use the gRPC protocol instead of net/rpc.
		// go-plugin defaults to net/rpc only when AllowedProtocols is nil,
		// so we must explicitly allow gRPC for Python plugins.
		clientConfig.AllowedProtocols = []plugin.Protocol{plugin.ProtocolGRPC}

		// Increase the start timeout for Python plugins to account for interpreter
		// startup time and module imports.
		clientConfig.StartTimeout = time.Second * 10
	}

	for _, opt := range opts {
		err := opt(&sup, clientConfig)
		if err != nil {
			return nil, errors.Wrap(err, "failed to apply option")
		}
	}

	sup.client = plugin.NewClient(clientConfig)

	rpcClient, err := sup.client.Client()
	if err != nil {
		return nil, err
	}

	// Python plugins use gRPC for hook communication.
	if isPython {
		// Extract the gRPC connection from go-plugin's client.
		grpcClient, ok := rpcClient.(*plugin.GRPCClient)
		if !ok {
			return nil, errors.New("expected gRPC client for Python plugin")
		}
		conn := grpcClient.Conn

		hooksClient, err := newHooksGRPCClient(conn, wrappedLogger)
		if err != nil {
			return nil, errors.Wrap(err, "failed to create gRPC hooks client")
		}
		sup.hooks = &hooksTimerLayer{pluginInfo.Manifest.Id, hooksClient, metrics}

		// Populate the implemented array from the gRPC client.
		impl, err := hooksClient.Implemented()
		if err != nil {
			return nil, errors.Wrap(err, "failed to query implemented hooks")
		}
		for _, hookName := range impl {
			if hookId, ok := hookNameToId[hookName]; ok {
				sup.implemented[hookId] = true
			}
		}

		wrappedLogger.Info("Python plugin started with gRPC hooks")
		return &sup, nil
	}
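
	// Non-Python plugins fall through to go-plugin's default net/rpc protocol:
	// dispense the hooks implementation from the RPC client.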
	raw, err := rpcClient.Dispense("hooks")
	if err != nil {
		return nil, err
	}

	c, ok := raw.(*hooksRPCClient)
	if ok {
		sup.hooksClient = c
	}

	sup.hooks = &hooksTimerLayer{pluginInfo.Manifest.Id, raw.(Hooks), metrics}

	impl, err := sup.hooks.Implemented()
	if err != nil {
		return nil, err
	}
	for _, hookName := range impl {
		if hookId, ok := hookNameToId[hookName]; ok {
			sup.implemented[hookId] = true
		}
	}

	return &sup, nil
}
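
// Shutdown stops the plugin process (or detaches from a reattached one), waits
// for the API and DB RPC servers to exit, shuts down the plugin's DB
// connections, and finally cleans up the Python plugin API server if one was
// started.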
func (sup *supervisor) Shutdown() {
	sup.lock.RLock()
	defer sup.lock.RUnlock()

	if sup.client != nil {
		// For reattached plugins, Kill() is mostly a no-op, so manually clean up the
		// underlying rpcClient. This might be something to upstream unless we're doing
		// something else wrong.
		if sup.isReattached {
			rpcClient, err := sup.client.Client()
			if err != nil {
				mlog.Warn("Failed to obtain rpcClient on Shutdown")
			} else {
				if err = rpcClient.Close(); err != nil {
					mlog.Warn("Failed to close rpcClient on Shutdown")
				}
			}
		}

		sup.client.Kill()
	}

	// Wait for the API RPC server and the DB RPC server to exit,
	// and then shut down the plugin's DB connections.
	if sup.hooksClient != nil {
		sup.hooksClient.doneWg.Wait()

		if sup.appDriver != nil {
			sup.appDriver.ShutdownConns(sup.pluginID)
		}
	}

	// Clean up the Python plugin API server if it was started.
	// This must happen AFTER the Python process terminates to allow graceful disconnect.
	if sup.apiServerCleanup != nil {
		sup.apiServerCleanup()
	}
}
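
// Hooks returns the plugin's Hooks implementation.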
func (sup *supervisor) Hooks() Hooks {
	sup.lock.RLock()
	defer sup.lock.RUnlock()
	return sup.hooks
}

// PerformHealthCheck checks the plugin through an RPC ping.
func (sup *supervisor) PerformHealthCheck() error {
	// No need for a lock here because Ping is read-locked.
	if pingErr := sup.Ping(); pingErr != nil {
		for pingFails := 1; pingFails < HealthCheckPingFailLimit; pingFails++ {
			pingErr = sup.Ping()
			if pingErr == nil {
				break
			}
		}
		if pingErr != nil {
			return fmt.Errorf("plugin RPC connection is not responding")
		}
	}

	return nil
}

// Ping checks that the RPC connection with the plugin is alive and healthy.
func (sup *supervisor) Ping() error {
	sup.lock.RLock()
	defer sup.lock.RUnlock()

	client, err := sup.client.Client()
	if err != nil {
		return err
	}

	return client.Ping()
}
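
// Implements reports whether the plugin implements the hook identified by hookId.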
func (sup *supervisor) Implements(hookId int) bool {
	sup.lock.RLock()
	defer sup.lock.RUnlock()
	return sup.implemented[hookId]
}
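
// getPluginExecutableChecksum returns the SHA-256 checksum of the plugin
// executable at executablePath, for use with go-plugin's SecureConfig.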
func getPluginExecutableChecksum(executablePath string) ([]byte, error) {
	pathHash := sha256.New()
	file, err := os.Open(executablePath)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	_, err = io.Copy(pathHash, file)
	if err != nil {
		return nil, err
	}

	return pathHash.Sum(nil), nil
}