-- |
-- Module      :  Text.Megaparsec.Char
-- Copyright   :  © 2015–present Megaparsec contributors
--                © 2007 Paolo Martini
--                © 1999–2001 Daan Leijen
-- License     :  FreeBSD
--
-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
-- Stability   :  experimental
-- Portability :  non-portable
--
-- Commonly used character parsers.

{-# LANGUAGE FlexibleContexts    #-}
{-# LANGUAGE LambdaCase          #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies        #-}

module Text.Megaparsec.Char
  ( -- * Simple parsers
    newline
  , crlf
  , eol
  , tab
  , space
  , space1
    -- * Categories of characters
  , controlChar
  , spaceChar
  , upperChar
  , lowerChar
  , letterChar
  , alphaNumChar
  , printChar
  , digitChar
  , binDigitChar
  , octDigitChar
  , hexDigitChar
  , markChar
  , numberChar
  , punctuationChar
  , symbolChar
  , separatorChar
  , asciiChar
  , latin1Char
  , charCategory
  , categoryName
    -- * Single character
  , char
  , char'
    -- * Sequence of characters
  , string
  , string' )
where

import Control.Applicative
import Data.Char
import Data.Functor (void)
import Data.Proxy
import Text.Megaparsec
import Text.Megaparsec.Common

----------------------------------------------------------------------------
-- Simple parsers

-- | Parse a newline character.

newline :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
newline :: m (Token s)
newline = Token s -> m (Token s)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Token s
'\n'
{-# INLINE newline #-}

-- | Parse a carriage return character followed by a newline character.
-- Return the sequence of characters parsed.

crlf :: forall e s m. (MonadParsec e s m, Token s ~ Char) => m (Tokens s)
crlf :: m (Tokens s)
crlf = Tokens s -> m (Tokens s)
forall e s (m :: * -> *).
MonadParsec e s m =>
Tokens s -> m (Tokens s)
string (Proxy s -> [Token s] -> Tokens s
forall s. Stream s => Proxy s -> [Token s] -> Tokens s
tokensToChunk (Proxy s
forall k (t :: k). Proxy t
Proxy :: Proxy s) [Token s]
"\r\n")
{-# INLINE crlf #-}

-- | Parse a CRLF (see 'crlf') or LF (see 'newline') end of line. Return the
-- sequence of characters parsed.

eol :: forall e s m. (MonadParsec e s m, Token s ~ Char) => m (Tokens s)
eol :: m (Tokens s)
eol = (Proxy s -> Token s -> Tokens s
forall s. Stream s => Proxy s -> Token s -> Tokens s
tokenToChunk (Proxy s
forall k (t :: k). Proxy t
Proxy :: Proxy s) (Char -> Tokens s) -> m Char -> m (Tokens s)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> m Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
newline)
  m (Tokens s) -> m (Tokens s) -> m (Tokens s)
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> m (Tokens s)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Tokens s)
crlf
  m (Tokens s) -> String -> m (Tokens s)
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "end of line"
{-# INLINE eol #-}

-- | Parse a tab character.

tab :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
tab :: m (Token s)
tab = Token s -> m (Token s)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Token s
'\t'
{-# INLINE tab #-}

-- | Skip /zero/ or more white space characters.
--
-- See also: 'skipMany' and 'spaceChar'.

space :: (MonadParsec e s m, Token s ~ Char) => m ()
space :: m ()
space = m (Tokens s) -> m ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (m (Tokens s) -> m ()) -> m (Tokens s) -> m ()
forall a b. (a -> b) -> a -> b
$ Maybe String -> (Token s -> Bool) -> m (Tokens s)
forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhileP (String -> Maybe String
forall a. a -> Maybe a
Just "white space") Char -> Bool
Token s -> Bool
isSpace
{-# INLINE space #-}

-- | Skip /one/ or more white space characters.
--
-- See also: 'skipSome' and 'spaceChar'.
--
-- @since 6.0.0

space1 :: (MonadParsec e s m, Token s ~ Char) => m ()
space1 :: m ()
space1 = m (Tokens s) -> m ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (m (Tokens s) -> m ()) -> m (Tokens s) -> m ()
forall a b. (a -> b) -> a -> b
$ Maybe String -> (Token s -> Bool) -> m (Tokens s)
forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhile1P (String -> Maybe String
forall a. a -> Maybe a
Just "white space") Char -> Bool
Token s -> Bool
isSpace
{-# INLINE space1 #-}

----------------------------------------------------------------------------
-- Categories of characters

-- | Parse a control character (a non-printing character of the Latin-1
-- subset of Unicode).

controlChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
controlChar :: m (Token s)
controlChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isControl m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "control character"
{-# INLINE controlChar #-}

-- | Parse a Unicode space character, and the control characters: tab,
-- newline, carriage return, form feed, and vertical tab.

spaceChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
spaceChar :: m (Token s)
spaceChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isSpace m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "white space"
{-# INLINE spaceChar #-}

-- | Parse an upper-case or title-case alphabetic Unicode character. Title
-- case is used by a small number of letter ligatures like the
-- single-character form of Lj.

upperChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
upperChar :: m (Token s)
upperChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isUpper m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "uppercase letter"
{-# INLINE upperChar #-}

-- | Parse a lower-case alphabetic Unicode character.

lowerChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
lowerChar :: m (Token s)
lowerChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isLower m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "lowercase letter"
{-# INLINE lowerChar #-}

-- | Parse an alphabetic Unicode character: lower-case, upper-case, or
-- title-case letter, or a letter of case-less scripts\/modifier letter.

letterChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
letterChar :: m (Token s)
letterChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isLetter m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "letter"
{-# INLINE letterChar #-}

-- | Parse an alphabetic or numeric digit Unicode characters.
--
-- Note that the numeric digits outside the ASCII range are parsed by this
-- parser but not by 'digitChar'. Such digits may be part of identifiers but
-- are not used by the printer and reader to represent numbers.

alphaNumChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
alphaNumChar :: m (Token s)
alphaNumChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isAlphaNum m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "alphanumeric character"
{-# INLINE alphaNumChar #-}

-- | Parse a printable Unicode character: letter, number, mark, punctuation,
-- symbol or space.

printChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
printChar :: m (Token s)
printChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isPrint m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "printable character"
{-# INLINE printChar #-}

-- | Parse an ASCII digit, i.e between “0” and “9”.

digitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
digitChar :: m (Token s)
digitChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isDigit m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "digit"
{-# INLINE digitChar #-}

-- | Parse a binary digit, i.e. "0" or "1".
--
-- @since 7.0.0

binDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
binDigitChar :: m (Token s)
binDigitChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isBinDigit m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "binary digit"
  where
    isBinDigit :: Char -> Bool
isBinDigit x :: Char
x = Char
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '0' Bool -> Bool -> Bool
|| Char
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '1'
{-# INLINE binDigitChar #-}

-- | Parse an octal digit, i.e. between “0” and “7”.

octDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
octDigitChar :: m (Token s)
octDigitChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isOctDigit m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "octal digit"
{-# INLINE octDigitChar #-}

-- | Parse a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”, or
-- “A” and “F”.

hexDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
hexDigitChar :: m (Token s)
hexDigitChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isHexDigit m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "hexadecimal digit"
{-# INLINE hexDigitChar #-}

-- | Parse a Unicode mark character (accents and the like), which combines
-- with preceding characters.

markChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
markChar :: m (Token s)
markChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isMark m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "mark character"
{-# INLINE markChar #-}

-- | Parse a Unicode numeric character, including digits from various
-- scripts, Roman numerals, etc.

numberChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
numberChar :: m (Token s)
numberChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isNumber m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "numeric character"
{-# INLINE numberChar #-}

-- | Parse a Unicode punctuation character, including various kinds of
-- connectors, brackets and quotes.

punctuationChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
punctuationChar :: m (Token s)
punctuationChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isPunctuation m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "punctuation"
{-# INLINE punctuationChar #-}

-- | Parse a Unicode symbol characters, including mathematical and currency
-- symbols.

symbolChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
symbolChar :: m (Token s)
symbolChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isSymbol m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "symbol"
{-# INLINE symbolChar #-}

-- | Parse a Unicode space and separator characters.

separatorChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
separatorChar :: m (Token s)
separatorChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isSeparator m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "separator"
{-# INLINE separatorChar #-}

-- | Parse a character from the first 128 characters of the Unicode
-- character set, corresponding to the ASCII character set.

asciiChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
asciiChar :: m (Token s)
asciiChar = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isAscii m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "ASCII character"
{-# INLINE asciiChar #-}

-- | Parse a character from the first 256 characters of the Unicode
-- character set, corresponding to the ISO 8859-1 (Latin-1) character set.

latin1Char :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
latin1Char :: m (Token s)
latin1Char = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
Token s -> Bool
isLatin1 m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> "Latin-1 character"
{-# INLINE latin1Char #-}

-- | @'charCategory' cat@ parses character in Unicode General Category
-- @cat@, see 'Data.Char.GeneralCategory'.

charCategory :: (MonadParsec e s m, Token s ~ Char)
  => GeneralCategory
  -> m (Token s)
charCategory :: GeneralCategory -> m (Token s)
charCategory cat :: GeneralCategory
cat = (Token s -> Bool) -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy ((GeneralCategory -> GeneralCategory -> Bool
forall a. Eq a => a -> a -> Bool
== GeneralCategory
cat) (GeneralCategory -> Bool)
-> (Char -> GeneralCategory) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> GeneralCategory
generalCategory) m Char -> String -> m Char
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> GeneralCategory -> String
categoryName GeneralCategory
cat
{-# INLINE charCategory #-}

-- | Return the human-readable name of Unicode General Category.

categoryName :: GeneralCategory -> String
categoryName :: GeneralCategory -> String
categoryName = \case
  UppercaseLetter      -> "uppercase letter"
  LowercaseLetter      -> "lowercase letter"
  TitlecaseLetter      -> "titlecase letter"
  ModifierLetter       -> "modifier letter"
  OtherLetter          -> "other letter"
  NonSpacingMark       -> "non-spacing mark"
  SpacingCombiningMark -> "spacing combining mark"
  EnclosingMark        -> "enclosing mark"
  DecimalNumber        -> "decimal number character"
  LetterNumber         -> "letter number character"
  OtherNumber          -> "other number character"
  ConnectorPunctuation -> "connector punctuation"
  DashPunctuation      -> "dash punctuation"
  OpenPunctuation      -> "open punctuation"
  ClosePunctuation     -> "close punctuation"
  InitialQuote         -> "initial quote"
  FinalQuote           -> "final quote"
  OtherPunctuation     -> "other punctuation"
  MathSymbol           -> "math symbol"
  CurrencySymbol       -> "currency symbol"
  ModifierSymbol       -> "modifier symbol"
  OtherSymbol          -> "other symbol"
  Space                -> "white space"
  LineSeparator        -> "line separator"
  ParagraphSeparator   -> "paragraph separator"
  Control              -> "control character"
  Format               -> "format character"
  Surrogate            -> "surrogate character"
  PrivateUse           -> "private-use Unicode character"
  NotAssigned          -> "non-assigned Unicode character"

----------------------------------------------------------------------------
-- Single character

-- | A type-constrained version of 'single'.
--
-- > semicolon = char ';'

char :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s)
char :: Token s -> m (Token s)
char = Token s -> m (Token s)
forall e s (m :: * -> *).
MonadParsec e s m =>
Token s -> m (Token s)
single
{-# INLINE char #-}

-- | The same as 'char' but case-insensitive. This parser returns the
-- actually parsed character preserving its case.
--
-- >>> parseTest (char' 'e') "E"
-- 'E'
-- >>> parseTest (char' 'e') "G"
-- 1:1:
-- unexpected 'G'
-- expecting 'E' or 'e'

char' :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s)
char' :: Token s -> m (Token s)
char' c :: Token s
c = [m Char] -> m Char
forall (f :: * -> *) (m :: * -> *) a.
(Foldable f, Alternative m) =>
f (m a) -> m a
choice
  [ Token s -> m (Token s)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char (Char -> Char
toLower Char
Token s
c)
  , Token s -> m (Token s)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char (Char -> Char
toUpper Char
Token s
c)
  , Token s -> m (Token s)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char (Char -> Char
toTitle Char
Token s
c)
  ]
{-# INLINE char' #-}