From 2e0f7775710985b8230428f3133ed6f99215dca3 Mon Sep 17 00:00:00 2001 From: Simon Marlow Date: Wed, 25 Mar 2026 12:19:58 +0000 Subject: [PATCH] Add internal query linter When working on the compiler it's easy to accidentally break something that results in a segfault. Most of these errors can be detected by checks on the Codegen IR, in particular correct type-safety and variable-scoping. - Add Glean.Query.Lint to implement query consistency checks - Turn on query linting with `GLEAN_DEBUG=lint` or `--debug-query-lint` - Linting is automatically enabled for all tests via `withTestEnv` --- glean.cabal.in | 1 + glean/db/Glean/Database/Config.hs | 5 +- glean/db/Glean/Database/Env.hs | 1 + glean/db/Glean/Query/Lint.hs | 290 ++++++++++++++++++++++++++ glean/db/Glean/Query/UserQuery.hs | 7 + glean/hs/Glean/RTS/Types.hs | 1 + glean/test/lib/Glean/Database/Test.hs | 9 +- glean/website/docs/running.md | 3 + 8 files changed, 314 insertions(+), 3 deletions(-) create mode 100644 glean/db/Glean/Query/Lint.hs diff --git a/glean.cabal.in b/glean.cabal.in index 35c872ce5..d7e316e82 100644 --- a/glean.cabal.in +++ b/glean.cabal.in @@ -651,6 +651,7 @@ library db Glean.Query.Expand Glean.Query.Flatten Glean.Query.Flatten.Types + Glean.Query.Lint Glean.Query.Opt Glean.Query.Prune Glean.Query.Reorder diff --git a/glean/db/Glean/Database/Config.hs b/glean/db/Glean/Database/Config.hs index fece5e72d..9dc7d4032 100644 --- a/glean/db/Glean/Database/Config.hs +++ b/glean/db/Glean/Database/Config.hs @@ -204,15 +204,17 @@ data Config = Config data DebugFlags = DebugFlags { tcDebug :: !Bool , queryDebug :: !Bool + , queryLint :: !Bool } instance Default DebugFlags where - def = DebugFlags { tcDebug = False, queryDebug = False } + def = DebugFlags { tcDebug = False, queryDebug = False, queryLint = False } instance Semigroup DebugFlags where a <> b = DebugFlags { tcDebug = tcDebug a || tcDebug b , queryDebug = queryDebug a || queryDebug b + , queryLint = queryLint a || queryLint b } instance Monoid DebugFlags where @@ -561,6 +563,7 @@ options = do debugParser = do tcDebug <- switch (long "debug-tc") queryDebug <- switch (long "debug-query") + queryLint <- switch (long "debug-query-lint") return DebugFlags{..} serverConfigThriftSource = option (eitherReader ThriftSource.parse) diff --git a/glean/db/Glean/Database/Env.hs b/glean/db/Glean/Database/Env.hs index 84faf01ca..1121cebaf 100644 --- a/glean/db/Glean/Database/Env.hs +++ b/glean/db/Glean/Database/Env.hs @@ -161,6 +161,7 @@ getDebugEnv = do where add "tc" = return def { tcDebug = True } add "query" = return def { queryDebug = True } + add "lint" = return def { queryLint = True } add other = do logWarning $ "Unkonwn GLEAN_DEBUG class: " <> other return def diff --git a/glean/db/Glean/Query/Lint.hs b/glean/db/Glean/Query/Lint.hs new file mode 100644 index 000000000..9c9f3a119 --- /dev/null +++ b/glean/db/Glean/Query/Lint.hs @@ -0,0 +1,290 @@ +{- + Copyright (c) Meta Platforms, Inc. and affiliates. + All rights reserved. + + This source code is licensed under the BSD-style license found in the + LICENSE file in the root directory of this source tree. +-} + +module Glean.Query.Lint ( + lintQuery + ) where + +import Control.Monad +import Control.Monad.Except +import Control.Monad.State +import Data.List (foldl') +import Data.IntSet (IntSet) +import qualified Data.IntSet as IntSet +import Compat.Prettyprinter + +import Glean.Angle.Types + (Type_(..), latestAngleVersion, tempPredicateId) +import Glean.Database.Schema.Types + (DbSchema, PredicateDetails(..), lookupPid, lookupPredicateId) +import Glean.Display +import Glean.Query.Codegen.Types +import Glean.RTS.Term as RTS +import Glean.RTS.Types as RTS +import Glean.Schema.Util (tupleSchema) + +-- | The lint monad: tracks the set of bound variable IDs in state, +-- and can throw a pretty-printed error. +type L a = StateT IntSet (Except (Doc ())) a + +runL :: L a -> Either (Doc ()) a +runL m = runExcept (evalStateT m IntSet.empty) + +lintError :: Doc () -> L a +lintError = throwError + +-- | Record a variable as bound. +bindVar :: Var -> L () +bindVar v = modify (IntSet.insert (varId v)) + +-- | Check that a variable is currently bound. +requireBound :: Var -> L () +requireBound v = do + bound <- get + unless (varId v `IntSet.member` bound) $ + lintError $ "unbound variable:" <+> displayDefault v + +-- | Run an action in a nested scope. The bound set is saved and +-- restored afterwards, so bindings made inside do not escape. +scoped :: L a -> L a +scoped m = do + saved <- get + a <- m + put saved + return a + +{- + Check the query for internal consistency. lintQuery will check + the following properties of the query: + + - type-correctness, e.g. + - in A = B, the types of A and B are equivalent + - in FactGenerator and DerivedFactGenerator, the types of the key + and value pattern match those of the predicate in the schema + - arguments of primitives have the correct type + - bind-before-use: for all variables, MatchBind occurs before MatchVar + +-} +lintQuery :: DbSchema -> CodegenQuery -> Either (Doc ()) () +lintQuery schema QueryWithInfo{..} = runL $ do + let CgQuery head body = qiQuery + lintStatements schema body + lintPat head + case qiGenerator of + Nothing -> checkEqType "query head" head qiReturnType + Just gen -> do + lintGenerator gen + lintGeneratorTypes schema head gen + +-- --------------------------------------------------------------------------- +-- Bind-before-use and type checking for statement lists + +lintStatements :: DbSchema -> [CgStatement] -> L () +lintStatements schema = mapM_ (lintStatement schema) + +lintStatement :: DbSchema -> CgStatement -> L () +lintStatement schema = \case + CgStatement pat gen -> do + -- The generator is evaluated first; its patterns can bind variables + lintGenerator gen + -- Then the LHS pattern matches the generator result, and can use + -- variables bound by the generator as well as previously bound ones + lintPat pat + -- Type check: pattern type should match generator result type + lintStmtTypes schema pat gen + + CgAllStatement var expr stmts -> do + -- The inner statements are evaluated in a nested scope + scoped $ do + lintStatements schema stmts + lintPat expr + -- The variable bound by 'all' is available to subsequent statements + bindVar var + + CgNegation stmts -> + -- Negation introduces its own scope; bindings do not escape + scoped $ lintStatements schema stmts + + CgDisjunction stmtss -> do + -- Each branch starts with the same bound set. Variables bound + -- in all branches are available afterwards. + bounds <- forM stmtss $ \stmts -> scoped $ do + lintStatements schema stmts + get + case bounds of + [] -> return () + (b:bs) -> put (foldl' IntSet.intersection b bs) + + CgConditional{..} -> do + outer <- get + lintStatements schema cond + -- then branch sees cond bindings + lintStatements schema then_ + thenBound <- get + -- else branch only sees pre-condition bindings + put outer + lintStatements schema else_ + elseBound <- get + -- Variables bound in both branches are available after + put (IntSet.intersection thenBound elseBound) + +-- --------------------------------------------------------------------------- +-- Check uses and record bindings in a single depth-first left-to-right +-- traversal, so that a MatchBind makes the variable immediately +-- available for a later MatchVar in the same pattern. + +lintPat :: Pat -> L () +lintPat = mapM_ lintMatch + +lintMatch :: Match () Var -> L () +lintMatch = \case + MatchBind v -> bindVar v + MatchVar v -> requireBound v + MatchAnd a b -> do + lintPat a + lintPat b + MatchPrefix _ rest -> + lintPat rest + MatchArrayPrefix _ pre whole -> do + mapM_ lintPat pre + lintPat whole + _ -> return () + +-- | Lint expression-position subterms of a generator, then bind +-- pattern-position variables. +lintGenerator :: Generator -> L () +lintGenerator = \case + FactGenerator _ kpat vpat _ -> do + lintPat kpat + lintPat vpat + TermGenerator expr -> + lintPat expr + DerivedFactGenerator _ key value -> do + lintPat key + lintPat value + ArrayElementGenerator _ arr -> + lintPat arr + SetElementGenerator _ set -> + lintPat set + PrimCall _ args _ -> + mapM_ lintPat args + +-- | Type-check the result generator (qiGenerator). The head expression +-- is the fact ID that the generator will look up. +lintGeneratorTypes :: DbSchema -> Pat -> Generator -> L () +lintGeneratorTypes schema head gen = case gen of + FactGenerator pidRef kpat vpat _ -> do + mdetails <- lookupPredicate schema pidRef + forM_ mdetails $ \details -> do + checkEqType + ("result generator key of" <+> displayDefault pidRef) + kpat (predicateKeyType details) + checkEqType + ("result generator value of" <+> displayDefault pidRef) + vpat (predicateValueType details) + checkEqType "result generator fact ID" head (PredicateTy () pidRef) + _ -> return () + +-- --------------------------------------------------------------------------- +-- Type checking + +-- | Check the type consistency of a statement: pattern type vs +-- generator result type, and predicate key/value types for fact +-- generators. +lintStmtTypes :: DbSchema -> Pat -> Generator -> L () +lintStmtTypes schema pat gen = case gen of + FactGenerator pidRef kpat vpat _ -> do + mdetails <- lookupPredicate schema pidRef + forM_ mdetails $ \details -> do + checkEqType + ("key of" <+> displayDefault pidRef) kpat (predicateKeyType details) + checkEqType + ("value of" <+> displayDefault pidRef) vpat (predicateValueType details) + -- The LHS pattern matches the fact ID, which has the predicate type + checkEqType "fact generator result" pat (PredicateTy () pidRef) + + DerivedFactGenerator pidRef key value -> do + mdetails <- lookupPredicate schema pidRef + forM_ mdetails $ \details -> do + checkEqType + ("key of" <+> displayDefault pidRef) key (predicateKeyType details) + checkEqType + ("value of" <+> displayDefault pidRef) value (predicateValueType details) + checkEqType "derived fact generator result" pat (PredicateTy () pidRef) + + TermGenerator expr -> + case (patType pat, patType expr) of + (Just pty, Just ety) -> typesShouldMatch "term generator" pty ety + _ -> return () + + ArrayElementGenerator eltTy _arr -> + checkEqType "array element generator" pat eltTy + + SetElementGenerator eltTy _set -> + checkEqType "set element generator" pat eltTy + + PrimCall _op _args retTy -> + checkEqType "primcall result" pat retTy + +lookupPredicate :: DbSchema -> PidRef -> L (Maybe PredicateDetails) +lookupPredicate schema (PidRef pid predId) + -- The temporary predicate used for derived fact generators is not + -- stored in the schema, so skip it. + | predId == tempPredicateId = return Nothing + | otherwise = + case lookupPid pid schema of + Nothing -> case lookupPredicateId predId schema of + Nothing -> + lintError $ "unknown predicate:" <+> displayDefault predId + Just details -> return (Just details) + Just details -> return (Just details) + +-- | Check that the type of a pattern matches an expected type. +-- When the pattern's type cannot be determined, the check is skipped. +checkEqType :: Doc () -> Pat -> Type -> L () +checkEqType ctx pat expected = + case patType pat of + Nothing -> return () + Just actual -> typesShouldMatch ctx actual expected + +typesShouldMatch :: Doc () -> Type -> Type -> L () +typesShouldMatch ctx actual expected + | eqType latestAngleVersion actual expected = return () + | otherwise = throwError $ vcat + [ "type mismatch in" <+> ctx <> ":" + , " expected:" <+> displayDefault expected + , " actual:" <+> displayDefault actual + ] + +-- --------------------------------------------------------------------------- +-- Inferring the type of a pattern + +-- | Attempt to determine the type of a pattern. Returns Nothing when +-- the type cannot be determined from the pattern structure alone +-- (e.g. an empty array or an Alt without surrounding context). +patType :: Pat -> Maybe Type +patType (Byte _) = Just ByteTy +patType (Nat _) = Just NatTy +patType (RTS.String _) = Just StringTy +patType (ByteArray _) = Just (ArrayTy ByteTy) +patType (Array []) = Nothing +patType (Array (t:_)) = ArrayTy <$> patType t +patType (Tuple ts) = tupleSchema <$> traverse patType ts +patType (Alt _ _) = Nothing +patType (Ref m) = matchType m + +matchType :: Match () Var -> Maybe Type +matchType (MatchWild ty) = Just ty +matchType (MatchNever ty) = Just ty +matchType (MatchBind v) = Just (varType v) +matchType (MatchVar v) = Just (varType v) +matchType (MatchFid _) = Nothing +matchType (MatchAnd a _) = patType a +matchType (MatchPrefix _ _) = Just StringTy +matchType (MatchArrayPrefix ty _ _) = Just (ArrayTy ty) +matchType (MatchExt _) = Nothing diff --git a/glean/db/Glean/Query/UserQuery.hs b/glean/db/Glean/Query/UserQuery.hs index 6b869bf90..dc79e29e6 100644 --- a/glean/db/Glean/Query/UserQuery.hs +++ b/glean/db/Glean/Query/UserQuery.hs @@ -74,6 +74,7 @@ import Glean.Query.Flatten import Glean.Query.Opt import Glean.Query.Reorder import Glean.Query.Incremental (makeIncremental) +import Glean.Query.Lint (lintQuery) import Glean.RTS as RTS import Glean.RTS.Bytecode.Disassemble import qualified Glean.RTS.Bytecode.Gen.Version as Bytecode @@ -987,6 +988,12 @@ compileAngleQuery rec ver dbSchema mode source stored debug = do reordered <- checkBadQuery id $ runExcept $ reorder dbSchema optimised ifDebug $ "reordered query: " <> show (displayDefault (qiQuery reordered)) + when (queryLint debug) $ + case lintQuery dbSchema reordered of + Left err -> throwIO $ Thrift.BadQuery $ + Text.pack $ show $ "query lint:" <+> err + Right () -> return () + final <- case mode of NoExtraSteps -> return reordered IncrementalDerivation getStats -> do diff --git a/glean/hs/Glean/RTS/Types.hs b/glean/hs/Glean/RTS/Types.hs index 2d865845a..e08a59b2a 100644 --- a/glean/hs/Glean/RTS/Types.hs +++ b/glean/hs/Glean/RTS/Types.hs @@ -143,6 +143,7 @@ eqType version a b = case (a,b) of (NatTy, NatTy) -> True (StringTy, StringTy) -> True (ArrayTy a, ArrayTy b) -> eqType version a b + (SetTy a, SetTy b) -> eqType version a b (RecordTy as, RecordTy bs) -> let isTuple = all (Text.isInfixOf "tuplefield" . fieldDefName) -- previous to version 7 records were always compared structurally diff --git a/glean/test/lib/Glean/Database/Test.hs b/glean/test/lib/Glean/Database/Test.hs index a2b6cdb7f..4921169fb 100644 --- a/glean/test/lib/Glean/Database/Test.hs +++ b/glean/test/lib/Glean/Database/Test.hs @@ -23,6 +23,7 @@ module Glean.Database.Test , setLMDBNoUnpack , enableTcDebug , enableQueryDebug + , enableQueryLint , enableRocksDBCache , withTestEnv , kickOffTestDB @@ -129,6 +130,10 @@ enableQueryDebug :: Setting enableQueryDebug cfg = cfg { cfgDebug = (cfgDebug cfg) { queryDebug = True } } +enableQueryLint :: Setting +enableQueryLint cfg = cfg + { cfgDebug = (cfgDebug cfg) { queryLint = True } } + enableRocksDBCache :: Setting enableRocksDBCache cfg = cfg { cfgServerConfig = (cfgServerConfig cfg) <&> \scfg -> scfg @@ -146,12 +151,12 @@ withTestEnv settings action = \(cfgAPI :: NullConfigProvider) -> do let dbConfig = foldl' (\acc f -> f acc) - def + (enableQueryLint def { cfgDataStore = tmpDataStore , cfgSchemaLocation = Just schemaLocationFiles , cfgServerConfig = ThriftSource.value def { ServerConfig.config_db_rocksdb_cache_mb = 0 } - } + }) settings withDatabases evb dbConfig cfgAPI action diff --git a/glean/website/docs/running.md b/glean/website/docs/running.md index 31844c8fe..d84645cdb 100644 --- a/glean/website/docs/running.md +++ b/glean/website/docs/running.md @@ -181,3 +181,6 @@ Enable debugging output for the Angle typechecker. * `--debug-query`
Enable debugging output for the Angle query compiler. + +* `--debug-query-lint`
+Enable consistency checks on the query before running; catches internal errors in the query compiler.