module TextViewport.Render.Segmentation where import Data.DList qualified as DL import Data.HashMap.Strict qualified as HM import Data.Hashable (Hashable) import Data.List (minimumBy) import Data.MonoTraversable.Unprefixed qualified as O import Data.Ord (comparing) import Data.Sequences (Index, IsSequence, Textual) import Data.Sequences qualified as S import Data.Text (Text) import Data.Text qualified as T import Data.Vector (Vector) import Data.Vector qualified as V import Text.Hyphenation qualified as H import TextViewport.Render.RenderedLine (RenderedLine(..)) -- TODO qualified -- | User-pluggable segmentation interface class Segmenter seg a where applySeg :: seg -> Int -> Int -> a -> Vector (RenderedLine a) -- | Built-in segmentation strategies (backwards compatible) data BuiltinSeg a = NoSegments | FixedWidthSegments | HyphenateSegments { hsLang :: H.Language , hsCache :: HM.HashMap a [(a, a)] } deriving (Eq, Show) -- | Built-in instance instance (Hashable a, Textual a, Index a ~ Int) => Segmenter (BuiltinSeg a) a where applySeg = applyStrategy applyStrategy :: (Hashable a, Textual a, Index a ~ Int) => BuiltinSeg a -> Int -> Int -> a -> Vector (RenderedLine a) applyStrategy NoSegments width itemIx txt = let rawLines = S.splitWhen (=='\n') txt chunks = map (S.take width) rawLines -- crop offsets = scanOffsetsWithNewlines chunks in V.fromList [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off } | (lineIx, (off, chunk)) <- zip [0..] (zip offsets chunks) ] applyStrategy FixedWidthSegments width itemIx txt = let rawLines = S.splitWhen (=='\n') txt (dl, _) = foldl step (DL.empty, 0) rawLines step (acc, off0) line = let chunks = chunkFixed width line offsets = scanOffsetsFrom off0 chunks offNext = off0 + O.length line + 1 acc' = acc `DL.append` DL.fromList (zip offsets chunks) in (acc', offNext) allChunks = DL.toList dl in V.fromList [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off } | (lineIx, (off, chunk)) <- zip [0..] allChunks ] applyStrategy (HyphenateSegments lang cache0) width itemIx txt = let rawLines = S.splitWhen (=='\n') txt -- fold over each physical line, accumulating: -- * all rendered (offset, chunk) pairs -- * updated hyphenation cache (unused for now) -- * running character offset across lines (dl, _cache1, _) = foldl (segmentOneLine lang width) (DL.empty, cache0, 0) rawLines allChunks = DL.toList dl in V.fromList [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off } | (lineIx, (off, chunk)) <- zip [0..] allChunks ] where -- Segment a single physical line using TeX‑lite hyphenation segmentOneLine :: (Hashable t, Textual t, Index t ~ Int) => H.Language -> Int -> (DL.DList (Int, t), HM.HashMap t [(t, t)], Int) -> t -> (DL.DList (Int, t), HM.HashMap t [(t, t)], Int) segmentOneLine lang width (acc, cache, off0) line = let (chunks, cache1) = segmentWithHyphenationTeXLite lang width line cache offsets = scanOffsetsFrom off0 chunks offNext = off0 + O.length line + 1 acc' = acc `DL.append` DL.fromList (zip offsets chunks) in (acc', cache1, offNext) --segmentOneLine (acc, cache, off0) line = -- let chunks = segmentWithHyphenationTeXLite lang width line -- offsets = scanOffsetsFrom off0 chunks -- offNext = off0 + O.length line + 1 -- +1 for newline -- acc' = acc ++ zip offsets chunks -- in (acc', cache, offNext) -- | Compute running character offsets for a list of chunks. scanOffsetsWithNewlines :: IsSequence t => [t] -> [Int] scanOffsetsWithNewlines = go 0 where go !_ [] = [] go !o (l:ls) = let off = o o' = o + O.length l + 1 -- +1 for newline in off : go o' ls -- | Chunk a single line into fixed-width pieces. chunkFixed :: (IsSequence t, Index t ~ Int) => Int -> t -> [t] chunkFixed w t | w <= 0 = [] | O.null t = [mempty] | otherwise = let (h, rest) = S.splitAt w t in h : if O.null rest then [] else chunkFixed w rest -------------------------------------------------------------------------------- -- Hyphenation-aware segmenting (TeX-lite) -------------------------------------------------------------------------------- -- Compute offsets starting from a base offset scanOffsetsFrom :: IsSequence t => Int -> [t] -> [Int] scanOffsetsFrom start = go start where go !_ [] = [] go !o (t:ts) = o : go (o + O.length t) ts segmentWithHyphenationTeXLite :: (Hashable t, Textual t, Index t ~ Int) => H.Language -> Int -> t -> HM.HashMap t [(t, t)] -> ([t], HM.HashMap t [(t, t)]) segmentWithHyphenationTeXLite lang width txt cache0 = go cache0 (S.words txt) where go cache [] = ([], cache) go cache ws = case lineCandidates lang width cache ws of ([], cache1) -> let chunks = breakWordSafe width ws in (chunks, cache1) (cs, cache1) -> let (line, rest, _) = minimumBy (comparing (scoreCandidate width)) cs (more, cache2) = go cache1 rest in (line : more, cache2) -- | Lossless fallback: treat remaining words as one long text and -- chunk it into width-sized pieces. Never truncates, never drops text. breakWordSafe :: (Textual t, Index t ~ Int) => Int -> [t] -> [t] breakWordSafe width ws = chunk (S.unwords ws) where chunk t | O.null t = [] | O.length t <= width = [t] | otherwise = let (c, r) = S.splitAt width t in c : chunk r type Candidate a = (a, [a], Bool) lineCandidates :: (Hashable t, Textual t, Index t ~ Int) => H.Language -> Int -> HM.HashMap t [(t, t)] -> [t] -> ([(t, [t], Bool)], HM.HashMap t [(t, t)]) lineCandidates lang width cache0 ws0 = go mempty [] cache0 ws0 where go _ acc cache [] = (acc, cache) go line acc cache (w:ws) = let space = if O.null line then mempty else S.singleton ' ' baseTxt = S.unwords line wholeTxt = baseTxt <> space <> w wholeLen = O.length wholeTxt acc1 = if wholeLen <= width && not (O.null wholeTxt) then (wholeTxt, ws, False) : acc else acc (hyphs, cache1) = case HM.lookup w cache of Just hs -> (hs, cache) Nothing -> let hs = hyphenateWord lang (T.pack $ S.unpack w) hs' = [ (S.pack $ T.unpack pre, S.pack $ T.unpack suf) | (pre, suf) <- hs ] in (hs', HM.insert w hs' cache) hyphCands = [ (preTxt, suf : ws, True) | (pre, suf) <- hyphs , not (O.null pre) , let preTxt = baseTxt <> space <> pre <> S.singleton '-' , O.length preTxt <= width ] acc2 = hyphCands ++ acc1 in if wholeLen <= width then go (line ++ [w]) acc2 cache1 ws else (acc2, cache1) hyphenateWord :: H.Language -> Text -> [(Text, Text)] hyphenateWord lang word = let parts = H.hyphenate (H.languageHyphenator lang) (T.unpack word) in [ ( T.pack (concat (take i parts)) , T.pack (concat (drop i parts)) ) | i <- [1 .. length parts - 1] ] scoreCandidate :: IsSequence t => Int -> Candidate t -> Int scoreCandidate width (line, _, endsWithHyphen) = let len = O.length line remSpace = max 0 (width - len) badness = remSpace * remSpace * remSpace hyphenPenalty = if endsWithHyphen then 50 else 0 shortPenalty = if len < width `div` 2 then 200 else 0 in badness + hyphenPenalty + shortPenalty