module UU_Scanner where
import Char
import List
import Maybe
import UU_BinaryTrees(tab2tree,btLocateIn)
import UU_Parsing(Symbol(..),IsParser,pSym,(<$>),pListSep,pPacked)
{- A parametrisable scanner
-
- Author: Doaitse Swierstra: doaitse@cs.uu.nl
and: Pablo Azero : pablo@cs.uu.nl
- Version 1.0 , May 25, 1998, SDS
first appearance on the software web site.
- Version 1.01, June 7, 1998, SDS
changed String recognition to recognise escaped characters
- Version 1.02, Aug 30, 1998, SDS
includes with unsafePerformIO
- Version 2.1, Jul 7, 1999, slightly different definition of token
ordering between tokens introduced
- Version 2.2, Jul 8, 1999, AG_Scanner and UU_Scanner merged
- Version 2.3, Jul 15, 1999, modifications: recognize decimal, octal and
- hexadecimal numbers; handles ' as part of a
- lower case identifier
- fixes: bug in msort (loops when passing an
- empty list)
- Version 2.4, Jul 23, 1999, additions: recognize characters and infix
- operators
-
- Lang. compat: Hugs 98 (because it is required by UU_Parsing)
- Version 2.5, Aug 15, 1999, changed names, pSym -> pSpec
, all parsers start with p....
- Version 2.6, Sept 15, 1999, changed error message for unterminated string
- Version 2.7, Sept 23, 1999, changed definition of pOper_Any
- Version 2.8 Aug 14, 2000, adapted to changes in search trees
- ?? Oct 25, 2000, adapted to use column numbers
- ?? Feb 2, 2001, incorporated changes of AD
- ?? Feb 28, 2001, tabs are handled correctly for column numbers
- ?? Mar 1, 2001, now generates space tokens that have to be filtered again
- ?? Apr 4, 2001, tabs are now handled relative to current column number
-}
data TokenType
= TkSymbol
| TkVarid
| TkConid
| TkKeyword
| TkOp
| TkString
| TkChar
| TkInteger8
| TkInteger10
| TkInteger16
| TkTextnm
| TkTextln
| TkSpace
| TkError
deriving (Eq, Ord)
type Line = Int
type Column = Int
data Pos = Pos{line:: !Line, column:: !Column}
type Filename = String
data Token = Tok { tp :: TokenType
, val1 :: String
, val2 :: String
, pos :: !Pos
, file :: !Filename
}
instance Eq Token where
(Tok ttypel stringl _ _ _ ) == (Tok ttyper stringr _ _ _) = ttypel == ttyper && stringl == stringr
instance Ord Token where
compare x y | x==y = EQ
| x<=y = LT
| otherwise = GT
(Tok ttypel stringl _ _ _ ) <= (Tok ttyper stringr _ _ _ )
= ttypel < ttyper
|| (ttypel == ttyper && stringl <= stringr)
maybeshow :: Pos -> Filename -> String
maybeshow (Pos 0 0) fn = ""
maybeshow (Pos l c) fn = " at line " ++ show l
++ ", column " ++ show c
++ " of file " ++ show fn
initPos :: Pos
initPos = Pos 1 1
noPos :: Pos
noPos = Pos 0 0
advl :: Line -> Pos ->Pos
advl i (Pos l c) = (Pos (l+i) 1)
advc :: Column -> Pos -> Pos
advc i (Pos l c) = (Pos l (c+i))
adv :: Pos -> Char -> Pos
adv pos c = case c of
'\t' -> advc (tabWidth (column pos)) pos
'\n' -> advl 1 pos
_ -> advc 1 pos
tabWidth :: Column -> Int
tabWidth c = 8 - ((c-1) `mod` 8)
instance Show Token where
showsPrec _ token
= showString
(case token of
(Tok TkSymbol _ s2 i fn) -> "symbol " ++ s2 ++ maybeshow i fn
(Tok TkOp _ s2 i fn) -> "operator " ++ s2 ++ maybeshow i fn
(Tok TkKeyword _ s2 i fn) -> show s2 ++ maybeshow i fn
(Tok TkString _ s2 i fn) -> "string \"" ++ s2 ++ "\"" ++ maybeshow i fn
(Tok TkChar _ s2 i fn) -> "character '" ++ s2 ++ "'" ++ maybeshow i fn
(Tok TkInteger8 _ s2 i fn) -> "octal integer " ++ s2 ++ maybeshow i fn
(Tok TkInteger10 _ s2 i fn) -> "decimal Integer " ++ s2 ++ maybeshow i fn
(Tok TkInteger16 _ s2 i fn) -> "hexadecimal integer " ++ s2 ++ maybeshow i fn
(Tok TkVarid _ s2 i fn) -> "lower case identifier " ++ s2 ++ maybeshow i fn
(Tok TkConid _ s2 i fn) -> "upper case identifier " ++ s2 ++ maybeshow i fn
(Tok TkTextnm _ s2 i fn) -> "text name " ++ s2 ++ maybeshow i fn
(Tok TkTextln _ s2 i fn) -> "text line " ++ s2 ++ maybeshow i fn
(Tok TkSpace _ s2 i fn) -> "spaces " ++ maybeshow i fn
(Tok TkError _ s2 i fn) -> "error in scanner: " ++ s2 ++ maybeshow i fn
)
instance Symbol Token where
deleteCost (Tok TkKeyword _ _ _ _) = 10
deleteCost _ = 5
keyToken,token :: TokenType -> String -> Pos -> Filename -> Token
keyToken tp key pos fn = Tok tp key key pos fn
token tp value pos fn = Tok tp "" value pos fn
errToken :: String -> Pos -> Filename -> Token
errToken = token TkError
skipline s = let (_,rest) = span (/='\n') s
in rest
scan :: [String] -> [String] -> String -> String -> String -> Pos -> String -> [Token]
scan keywordstxt keywordsops specchars opchars fn pos input
= doScan pos input
where
locatein :: Ord a => [a] -> a -> Bool
locatein es = isJust . btLocateIn compare (tab2tree (sort es))
iskw = locatein keywordstxt
isop = locatein keywordsops
isSymbol = locatein specchars
isOpsym = locatein opchars
isIdStart c = isLower c || c == '_'
isIdChar c = isAlphaNum c
|| c == '\''
|| c == '_'
scanIdent p s = let (name,rest) = span isIdChar s
in (name,advc (length name) p,rest)
doScan p [] = []
doScan p (c:s) | isSpace c = let (sp,next) = span isSpace s
in doScan (foldl adv p (c:sp)) next
doScan p ('-':'-':s) = doScan p (dropWhile (/= '\n') s)
doScan p ('{':'-':s) = lexNest fn doScan (advc 2 p) s
doScan p ('"':ss)
= let (s,swidth,rest) = scanString ss
in if null rest || head rest /= '"'
then errToken "Unterminated string literal" p fn : doScan (advc swidth p) rest
else token TkString s p fn : doScan (advc (swidth+2) p) (tail rest)
doScan p ('\'':ss)
= let (mc,cwidth,rest) = scanChar ss
in case mc of
Nothing -> errToken "Error in character literal" p fn : doScan (advc cwidth p) rest
Just c -> if null rest || head rest /= '\''
then errToken "Unterminated character literal" p fn : doScan (advc (cwidth+1) p) rest
else token TkChar [c] p fn : doScan (advc (cwidth+2) p) (tail rest)
{-
In Haskell infix identifiers consist of three separate tokens(two backquotes + identifier)
doScan p ('`':ss)
= case ss of
[] -> [errToken "Unterminated infix identifier" p fn]
(c:s) -> let res | isIdStart c || isUpper c =
let (name,p1,rest) = scanIdent (advc 2 p) s
ident = c:name
tokens | null rest ||
head rest /= '`' = errToken "Unterminated infix identifier" p fn
: doScan p1 rest
| iskw ident = errToken ("Keyword used as infix identifier: " ++ ident) p fn
: doScan (advc 1 p1) (tail rest)
| otherwise = token TkOp ident p fn
: doScan (advc 1 p1) (tail rest)
in tokens
| otherwise = errToken ("Unexpected character in infix identifier: " ++ show c) p fn
: doScan (adv p c) s
in res
-}
doScan p cs@(c:s)
| isSymbol c = keyToken TkSymbol [c] p fn
: doScan(advc 1 p) s
| isIdStart c || isUpper c
= let (name', p', s') = scanIdent (advc 1 p) s
name = c:name'
tok = if iskw name
then keyToken TkKeyword name p fn
else if null name' && isSymbol c
then keyToken TkSymbol [c] p fn
else token (if isIdStart c then TkVarid else TkConid) name p fn
in tok : doScan p' s'
| isOpsym c = let (name, s') = span isOpsym cs
tok | isop name = keyToken TkKeyword name p fn
| otherwise = token TkOp name p fn
in tok : doScan (foldl adv p name) s'
| isDigit c = let (tktype,number,width,s') = getNumber cs
in token tktype number p fn : doScan (advc width p) s'
| otherwise = errToken ("Unexpected character " ++ show c) p fn
: doScan (adv p c) s
{-
-- ks: no clean implementation of columns
readname s lc fn = (name,orest,nlc)
where (line,irest) = span (/='\n') s
orest = if null irest then "" else irest
nlc = if null irest then lc else (lc `advl` 1)
name = takename . dropWhile (\x -> not $ x `elem` "{[") $ line
takename ln | null ln = ""
| otherwise = if not (null tln) && (isAlpha . head $ tln)
then if not (null rln) && (head rln `elem` "}]")
then cname
else err lc fn 1
else err lc fn 1
where (cname, rln) = span validChar tln
tln = tail ln
validChar c = isAlpha c || c `elem` ".-_" || isDigit c
-- ks: changed definition from (lc+1) to (lc)
err lc fn 1 = error ("in scanner bad name definition" ++ maybeshow (lc) fn)
err lc fn 2
= error ("in scanner not a valid name in file inclusion" ++ maybeshow (lc) fn)
-}
lexNest fn cont pos inp = lexNest' cont pos inp
where lexNest' c p ('-':'}':s) = c (advc 2 p) s
lexNest' c p ('{':'-':s) = lexNest' (lexNest' c) (advc 2 p) s
lexNest' c p (x:s) = lexNest' c (adv p x) s
lexNest' _ _ [] = [ errToken "Unterminated nested comment" pos fn ]
scanString [] = ("",0,[])
scanString ('\\':'&':xs) = let (str,w,r) = scanString xs
in (str,w+2,r)
scanString ('\'':xs) = let (str,w,r) = scanString xs
in ('\'': str,w+1,r)
scanString xs = let (ch,cw,cr) = getchar xs
(str,w,r) = scanString cr
str' = maybe "" (:str) ch
in maybe ("",0,xs) (\c -> (c:str,cw+w,r)) ch
scanChar ('"' :xs) = (Just '"',1,xs)
scanChar xs = getchar xs
getchar [] = (Nothing,0,[])
getchar s@('\n':_ ) = (Nothing,0,s )
getchar s@('\t':_ ) = (Nothing,0,s)
getchar s@('\'':_ ) = (Nothing,0,s)
getchar s@('"' :_ ) = (Nothing,0,s)
getchar ('\\':xs) = let (c,l,r) = getEscChar xs
in (c,l+1,r)
getchar (x:xs) = (Just x,1,xs)
getEscChar [] = (Nothing,0,[])
getEscChar s@(x:xs) | isDigit x = let (tp,n,len,rest) = getNumber s
val = case tp of
TkInteger8 -> readn 8 n
TkInteger16 -> readn 16 n
TkInteger10 -> readn 10 n
in if val >= 0 && val <= 255
then (Just (chr val),len, rest)
else (Nothing,1,rest)
| otherwise = case x `lookup` cntrChars of
Nothing -> (Nothing,0,s)
Just c -> (Just c,1,xs)
where cntrChars = [('a','\a'),('b','\b'),('f','\f'),('n','\n'),('r','\r'),('t','\t')
,('v','\v'),('\\','\\'),('"','\"'),('\'','\'')]
readn base n = foldl (\r x -> value x + base * r) 0 n
getNumber cs@(c:s)
| c /= '0' = num10
| null s = const0
| hs == 'x' || hs == 'X' = num16
| hs == 'o' || hs == 'O' = num8
| otherwise = num10
where (hs:ts) = s
const0 = (TkInteger10, "0",1,s)
num10 = let (n,r) = span isDigit cs
in (TkInteger10,n,length n,r)
num16 = readNum isHexaDigit ts TkInteger16
num8 = readNum isOctalDigit ts TkInteger8
readNum p ts tk
= let nrs@(n,rs) = span p ts
in if null n then const0
else (tk , n, 2+length n,rs)
isHexaDigit d = isDigit d || (d >= 'A' && d <= 'F') || (d >= 'a' && d <= 'f')
isOctalDigit d = d >= '0' && d <= '7'
value c | isDigit c = ord c - ord '0'
| isUpper c = ord c - ord 'A' + 10
| isLower c = ord c - ord 'a' + 10
-------------------------------------------------------------------------
-- IsParsers for Symbols
-------------------------------------------------------------------------
get_tok_val (Tok _ _ s _ _) = s
gsym :: IsParser p Token => TokenType -> String -> String -> p String
gsym kind val val2 = get_tok_val <$> pSym (Tok kind val val2 noPos "")
pString, pChar, pInteger8, pInteger10, pInteger16, pVarid, pConid,
pTextnm, pTextln, pInteger :: IsParser p Token => p String
pOper name = gsym TkOp name name
pKey keyword = gsym TkKeyword keyword keyword
pSpec s = gsym TkSymbol [s] [s]
pString = gsym TkString "" ""
pChar = gsym TkChar "" "\NUL"
pInteger8 = gsym TkInteger8 "" "1"
pInteger10 = gsym TkInteger10 "" "1"
pInteger16 = gsym TkInteger16 "" "1"
pVarid = gsym TkVarid "" "?lc?"
pConid = gsym TkConid "" "?uc?"
pTextnm = gsym TkTextnm "" ""
pTextln = gsym TkTextln "" ""
pInteger = pInteger10
pComma, pSemi, pOParen, pCParen, pOBrack, pCBrack, pOCurly, pCCurly
:: IsParser p Token => p String
pComma = pSpec ','
pSemi = pSpec ';'
pOParen = pSpec '('
pCParen = pSpec ')'
pOBrack = pSpec '['
pCBrack = pSpec ']'
pOCurly = pSpec '{'
pCCurly = pSpec '}'
pCommas :: IsParser p Token => p a -> p [a]
pSemics :: IsParser p Token => p a -> p [a]
pParens :: IsParser p Token => p a -> p a
pBracks :: IsParser p Token => p a -> p a
pCurly :: IsParser p Token => p a -> p a
pCommas = pListSep pComma
pSemics = pListSep pSemi
pParens = pPacked pOParen pCParen
pBracks = pPacked pOBrack pCBrack
pCurly = pPacked pOCurly pCCurly
pParens_pCommas :: IsParser p Token => p a -> p [a]
pBracks_pCommas :: IsParser p Token => p a -> p [a]
pCurly_pSemics :: IsParser p Token => p a -> p [a]
pParens_pCommas = pParens.pCommas
pBracks_pCommas = pBracks.pCommas
pCurly_pSemics = pCurly .pSemics