diff options
author | polwex <polwex@sortug.com> | 2025-06-03 15:41:31 +0700 |
---|---|---|
committer | polwex <polwex@sortug.com> | 2025-06-03 15:41:31 +0700 |
commit | 175ddca375cef765cec8ca5bbc527a205c40bf25 (patch) | |
tree | f2e47a5d85e4d5e0297613e5a17cebce7d09b09b | |
parent | 2401217a4019938d1c1cc61b6e33ccb233eb6e74 (diff) |
preeeeettty much done FUCK YES
-rw-r--r-- | src/actions/tones.ts | 10 | ||||
-rw-r--r-- | src/components/tones/ToneSelectorClient.tsx | 405 | ||||
-rw-r--r-- | src/components/ui/select.tsx | 42 | ||||
-rw-r--r-- | src/lib/calls/nlp.ts | 1 | ||||
-rw-r--r-- | src/lib/db/prosodydb.ts | 215 | ||||
-rw-r--r-- | src/lib/db/prosodyschema.sql | 98 | ||||
-rw-r--r-- | src/lib/db/thaiseed.ts | 253 | ||||
-rw-r--r-- | src/lib/db/thaiseedold.ts | 301 | ||||
-rw-r--r-- | src/lib/types/phonetics.ts | 4 | ||||
-rw-r--r-- | src/lib/utils.ts | 5 | ||||
-rw-r--r-- | src/pages/api/tts.ts | 81 | ||||
-rw-r--r-- | src/pages/tones.tsx | 6 |
12 files changed, 1109 insertions, 312 deletions
diff --git a/src/actions/tones.ts b/src/actions/tones.ts index 0f28612..7d9cb34 100644 --- a/src/actions/tones.ts +++ b/src/actions/tones.ts @@ -1,6 +1,9 @@ "use server"; +import db from "@/lib/db"; import pdb from "@/lib/db/prosodydb"; +import { MutationOrder } from "@/lib/types/phonetics"; +import { randomFromArray } from "@/lib/utils"; import { WordData } from "@/zoom/logic/types"; // Helper to extract tone from prosody - assuming prosody is an array of objects like [{tone: number}, ...] @@ -10,6 +13,13 @@ const getTonesFromProsody = (prosody: any): number[] | null => { } return null; }; + +export async function mutateToneSelection( + order: MutationOrder, // Array of tones, one for each syllable. null means any tone. +) { + const res = pdb.fetchWordsByToneSylsWords(order); + return res; +} export async function fetchWordsByToneAndSyllables( tones: (string | null)[], // Array of tones, one for each syllable. null means any tone. ) { diff --git a/src/components/tones/ToneSelectorClient.tsx b/src/components/tones/ToneSelectorClient.tsx index 0ee9433..8a0327c 100644 --- a/src/components/tones/ToneSelectorClient.tsx +++ b/src/components/tones/ToneSelectorClient.tsx @@ -1,52 +1,240 @@ -'use client'; +"use client"; -import { useState, useEffect, useTransition } from 'react'; -import { WordData } from '@/zoom/logic/types'; -import { fetchWordsByToneAndSyllables } from '@/actions/tones'; -import { Button } from '@/components/ui/button'; -import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from '@/components/ui/select'; -import { Card, CardContent, CardDescription, CardFooter, CardHeader, CardTitle } from '@/components/ui/card'; -import { Label } from '@/components/ui/label'; -import { Skeleton } from '@/components/ui/skeleton'; // For loading state +import { useState, useEffect, useTransition, useRef } from "react"; +import { WordData } from "@/zoom/logic/types"; +import { + fetchWordsByToneAndSyllables, + mutateToneSelection, +} from "@/actions/tones"; +import { Button } from "@/components/ui/button"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import { + Card, + CardContent, + CardDescription, + CardFooter, + CardHeader, + CardTitle, +} from "@/components/ui/card"; +import { Label } from "@/components/ui/label"; +import { Skeleton } from "@/components/ui/skeleton"; // For loading state +import { MutationOrder, ToneQuery } from "@/lib/types/phonetics"; +import { ProsodySyllable } from "@/lib/types/cards"; +import { ArrowLeft, ArrowRight, Loader2, Volume2 } from "lucide-react"; +function getColorByTone(tone: string): string { + if (tone === "mid") return "blue"; + if (tone === "low") return "green"; + if (tone === "falling") return "gold"; + if (tone === "high") return "purple"; + if (tone === "rising") return "black"; + else return "black"; +} // Helper to display tones prominently -const ProminentToneDisplay = ({ wordData }: { wordData: WordData }) => { - if (!wordData.prosody || !Array.isArray(wordData.prosody)) { - return <p className="text-gray-500">No prosody data</p>; +const ProminentToneDisplay = ({ word }: { word: any }) => { + const tones: string[] = word.tone_sequence.split(","); + const syls: string[] = word.syl_seq.split(","); + const [isPending, startTransition] = useTransition(); + function mutateWord(idx: number) { + console.log("changing", idx); + const mutationOrder: MutationOrder = syls.map((s, i) => { + if (idx === i) return { change: tones[idx]! }; + else return { keep: syls[i]! }; + }); + console.log("hey hey", word); + startTransition(async () => { + const words = await mutateToneSelection(mutationOrder); + console.log({ words }); + // setCurrentWord(word); + }); + } + // playing audio + // const sourceRef = useRef<AudioBufferSourceNode>(null); + const audioRef = useRef<HTMLAudioElement>(null); + + async function playAudio() { + // setLoading(true); + // const audioContext = new (window.AudioContext || + // (window as any).webkitAudioContext)(); + // const response = await fetch(audioUrl); + // const arrayBuffer = await response.arrayBuffer(); + // const audioBuffer = await audioContext.decodeAudioData(arrayBuffer); + // if (audioContext && audioBuffer) { + // setLoading(false); + // const source = audioContext.createBufferSource(); + // source.buffer = audioBuffer; + // source.connect(audioContext.destination); + // source.start(); + // sourceRef.current = source; + // } + const res = await fetch(`/api/tts?word=${word.spelling}&lang=thai`); + const audioBlob = await res.blob(); + const audioURL = URL.createObjectURL(audioBlob); + if (audioRef.current) { + audioRef.current.src = audioURL; + audioRef.current.play(); + } } return ( <div className="flex flex-col items-center mb-4"> - <h1 className="text-6xl font-bold text-blue-600 mb-2">{wordData.spelling}</h1> - <div className="flex space-x-4"> - {wordData.prosody.map((p, index) => ( - <div key={index} className="text-center"> - <p className="text-sm text-gray-500">Syllable {index + 1}</p> - <p className="text-5xl font-semibold text-indigo-500">{p.tone ?? '?'}</p> - </div> + <h1 className="text-6xl font-bold mb-2"> + {syls.map((syl: string, idx: number) => ( + <span + key={syl + idx} + onClick={() => mutateWord(idx)} + style={{ color: getColorByTone(tones[idx]!) }} + className="cursor-pointer hover:text-gray-700" + > + {syl} + </span> ))} + </h1> + <div className="mt-4 space-x-4"> + <p className="ipa text-xl text-gray-700 mt-2">{word.ipa}</p> + <button + className="p-1 text-blue-500 hover:text-blue-700 transition-colors" + title="Pronounce" + onClick={playAudio} + > + <Volume2 size={20} /> + </button> + {isPending && <Loader2 />} + <audio ref={audioRef} /> + <p className="ipa text-xl text-gray-700 mt-2">{word.frequency}</p> + <p className="ipa text-xl text-gray-700 mt-2">{word.word_id}</p> </div> - {wordData.ipa && wordData.ipa.length > 0 && ( - <p className="text-xl text-gray-700 mt-2"> - {wordData.ipa.map(i => i.ipa).join(' / ')} - </p> - )} </div> ); }; +export default function ToneSelectorClient({ + initialData, + initialTones, +}: { + initialData: any[]; + initialTones: ToneQuery; +}) { + const [data, setData] = useState<any[]>(initialData); + const [currentIdx, setCurrentIdx] = useState(0); + const [isLoading, startTransition] = useTransition(); + const [selectedTones, setTones] = useState<ToneQuery>(initialTones); + + function goPrev() { + setCurrentIdx((i) => (i === 0 ? 0 : i - 1)); + } + function goNext() { + setCurrentIdx((i) => (i === data.length - 1 ? data.length - 1 : i + 1)); + } + + const handleFetch = () => { + startTransition(async () => { + const words = await fetchWordsByToneAndSyllables(selectedTones); + setData(words); + }); + }; -export default function ToneSelectorClient({ initialWord }: { initialWord: WordData | null }) { - const [currentWord, setCurrentWord] = useState<WordData | null>(initialWord); - const [syllableCount, setSyllableCount] = useState<number>(initialWord?.syllables || 1); - const [selectedTones, setSelectedTones] = useState<(number | null)[]>( - initialWord?.prosody?.map(p => p.tone ?? null) || [null] + return ( + <div className="container mx-auto p-4 max-w-2xl"> + <ToneForm + isLoading={isLoading} + handleFetch={handleFetch} + selectedTones={selectedTones} + setTones={setTones} + /> + + <Inner + isLoading={isLoading} + currentWord={data[currentIdx]} + goPrev={goPrev} + goNext={goNext} + /> + </div> ); - const [isLoading, startTransition] = useTransition(); +} +type IProps = { + isLoading: boolean; + currentWord: any; + goPrev: () => void; + goNext: () => void; +}; +function Inner({ isLoading, currentWord, goPrev, goNext }: IProps) { + return isLoading ? ( + <Card> + <CardHeader> + <Skeleton className="h-12 w-3/4" /> + </CardHeader> + <CardContent className="space-y-4"> + <Skeleton className="h-8 w-1/2" /> + <Skeleton className="h-20 w-full" /> + <Skeleton className="h-6 w-full" /> + </CardContent> + </Card> + ) : currentWord ? ( + <Card> + <CardHeader> + <CardTitle className="text-center">Current Word</CardTitle> + </CardHeader> + <CardContent> + <ProminentToneDisplay word={currentWord} /> + {/* You can add more details from WordData here if needed, like definitions */} + </CardContent> + <CardFooter className="justify-between"> + <ArrowLeft onClick={goPrev} /> + <ArrowRight onClick={goNext} /> + </CardFooter> + </Card> + ) : ( + <Card> + <CardHeader> + <CardTitle className="text-center">No Word Found</CardTitle> + </CardHeader> + <CardContent> + <p className="text-center text-gray-600"> + Could not find a Thai word matching your criteria. Try different + selections. + </p> + </CardContent> + </Card> + ); +} + +type ToneFormProps = { + isLoading: boolean; + handleFetch: (tones: ToneQuery) => void; + selectedTones: ToneQuery; + setTones: React.Dispatch<React.SetStateAction<ToneQuery>>; +}; +function ToneForm({ + selectedTones, + setTones, + isLoading, + handleFetch, +}: ToneFormProps) { + const thaiTones = [ + { value: "mid", label: "1 (Mid)" }, + { value: "low", label: "2 (Low)" }, + { value: "falling", label: "3 (Falling)" }, + { value: "high", label: "4 (High)" }, + { value: "rising", label: "5 (Rising)" }, + ]; + const [syllableCount, setSyllableCount] = useState<number>(2); + function decrSyl() { + setSyllableCount((s) => (s <= 1 ? 1 : s - 1)); + } + function incrSyl() { + setSyllableCount((s) => (s >= 5 ? 5 : s + 1)); + } useEffect(() => { // Adjust selectedTones array length when syllableCount changes - setSelectedTones(prevTones => { + setTones((prevTones) => { const newTones = Array(syllableCount).fill(null); for (let i = 0; i < Math.min(prevTones.length, syllableCount); i++) { newTones[i] = prevTones[i]; @@ -55,79 +243,51 @@ export default function ToneSelectorClient({ initialWord }: { initialWord: WordD }); }, [syllableCount]); - const handleFetchWord = () => { - startTransition(async () => { - const word = await fetchWordsByToneAndSyllables(syllableCount, selectedTones); - setCurrentWord(word); - }); - }; - const handleSyllableCountChange = (value: string) => { const count = parseInt(value, 10); - if (!isNaN(count) && count > 0 && count <= 5) { // Max 5 syllables for simplicity + if (!isNaN(count) && count > 0 && count <= 5) { + // Max 5 syllables for simplicity setSyllableCount(count); } }; const handleToneChange = (syllableIndex: number, value: string) => { - const tone = value === 'any' ? null : parseInt(value, 10); - setSelectedTones(prevTones => { + const tone = value === "any" ? null : value; + setTones((prevTones) => { const newTones = [...prevTones]; newTones[syllableIndex] = tone; return newTones; }); }; - - const thaiTones = [ - { value: '1', label: '1 (Mid)' }, - { value: '2', label: '2 (Low)' }, - { value: '3', label: '3 (Falling)' }, - { value: '4', label: '4 (High)' }, - { value: '5', label: '5 (Rising)' }, - ]; return ( - <div className="container mx-auto p-4 max-w-2xl"> - <Card className="mb-6"> - <CardHeader> - <CardTitle>Thai Tone Explorer</CardTitle> - <CardDescription>Select syllable count and tones to find Thai words.</CardDescription> - </CardHeader> - <CardContent className="space-y-6"> - <div> - <Label htmlFor="syllable-count" className="text-lg font-medium">Number of Syllables</Label> - <Select - value={syllableCount.toString()} - onValueChange={handleSyllableCountChange} - > - <SelectTrigger id="syllable-count" className="w-full md:w-1/2 mt-1"> - <SelectValue placeholder="Select number of syllables" /> - </SelectTrigger> - <SelectContent> - {[1, 2, 3, 4, 5].map(num => ( - <SelectItem key={num} value={num.toString()}> - {num} Syllable{num > 1 ? 's' : ''} - </SelectItem> - ))} - </SelectContent> - </Select> - </div> - + <Card className="mb-6"> + <CardHeader> + <CardTitle>Thai Tone Explorer</CardTitle> + <CardDescription> + Select syllable count and tones to find Thai words. + </CardDescription> + </CardHeader> + <CardContent className="space-y-6"> + <div className="flex gap-10 justify-center"> {Array.from({ length: syllableCount }).map((_, index) => ( - <div key={index}> - <Label htmlFor={`tone-select-${index}`} className="text-lg font-medium"> - Tone for Syllable {index + 1} - </Label> + <div key={index} className="w-fit"> <Select - value={selectedTones[index]?.toString() || 'any'} + value={selectedTones[index]?.toString() || "any"} onValueChange={(value) => handleToneChange(index, value)} > - <SelectTrigger id={`tone-select-${index}`} className="w-full md:w-1/2 mt-1"> - <SelectValue placeholder={`Select tone for syllable ${index + 1}`} /> + <SelectTrigger + id={`tone-select-${index}`} + className="w-full md:w-full mt-1" + > + <SelectValue + className="w-full" + placeholder={`Select tone for syllable ${index + 1}`} + /> </SelectTrigger> - <SelectContent> + <SelectContent className="lolol md:w-full bg-white w-full"> <SelectItem value="any">Any Tone</SelectItem> - {thaiTones.map(tone => ( + {thaiTones.map((tone) => ( <SelectItem key={tone.value} value={tone.value}> {tone.label} </SelectItem> @@ -136,64 +296,23 @@ export default function ToneSelectorClient({ initialWord }: { initialWord: WordD </Select> </div> ))} - </CardContent> - <CardFooter> - <Button onClick={handleFetchWord} disabled={isLoading} className="w-full md:w-auto"> - {isLoading ? 'Searching...' : 'Find Word'} - </Button> - </CardFooter> - </Card> - - {isLoading && !currentWord && ( - <Card> - <CardHeader><Skeleton className="h-12 w-3/4" /></CardHeader> - <CardContent className="space-y-4"> - <Skeleton className="h-8 w-1/2" /> - <Skeleton className="h-20 w-full" /> - <Skeleton className="h-6 w-full" /> - </CardContent> - </Card> - )} - - {!isLoading && currentWord && ( - <Card> - <CardHeader> - <CardTitle className="text-center">Current Word</CardTitle> - </CardHeader> - <CardContent> - <ProminentToneDisplay wordData={currentWord} /> - {/* You can add more details from WordData here if needed, like definitions */} - {currentWord.senses && currentWord.senses.length > 0 && ( - <div className="mt-4 pt-4 border-t"> - <h3 className="text-lg font-semibold mb-2">Meanings:</h3> - {currentWord.senses.map((sense, sIdx) => ( - <div key={sIdx} className="mb-2 p-2 border rounded bg-gray-50"> - <p className="font-medium text-indigo-600">{sense.pos}</p> - {sense.senses && Array.isArray(sense.senses) && sense.senses.map((subSense, ssIdx) => ( - subSense.glosses && Array.isArray(subSense.glosses) && subSense.glosses.map((gloss: string, gIdx: number) => ( - <p key={`${ssIdx}-${gIdx}`} className="text-sm text-gray-700 ml-2">- {gloss}</p> - )) - ))} - </div> - ))} - </div> - )} - </CardContent> - </Card> - )} - - {!isLoading && !currentWord && ( - <Card> - <CardHeader> - <CardTitle className="text-center">No Word Found</CardTitle> - </CardHeader> - <CardContent> - <p className="text-center text-gray-600"> - Could not find a Thai word matching your criteria. Try different selections. - </p> - </CardContent> - </Card> - )} - </div> + </div> + </CardContent> + <CardFooter className="justify-center gap-18"> + <Button className="" onClick={decrSyl}> + - + </Button> + <Button + onClick={() => handleFetch(selectedTones)} + disabled={isLoading} + className="w-full md:w-auto" + > + {isLoading ? "Searching..." : "Fetch"} + </Button> + <Button className="" onClick={incrSyl}> + + + </Button> + </CardFooter> + </Card> ); } diff --git a/src/components/ui/select.tsx b/src/components/ui/select.tsx index b624a5b..23e7161 100644 --- a/src/components/ui/select.tsx +++ b/src/components/ui/select.tsx @@ -1,25 +1,25 @@ -import * as React from "react" -import * as SelectPrimitive from "@radix-ui/react-select" -import { CheckIcon, ChevronDownIcon, ChevronUpIcon } from "lucide-react" +import * as React from "react"; +import * as SelectPrimitive from "@radix-ui/react-select"; +import { CheckIcon, ChevronDownIcon, ChevronUpIcon } from "lucide-react"; -import { cn } from "@/lib/utils" +import { cn } from "@/lib/utils"; function Select({ ...props }: React.ComponentProps<typeof SelectPrimitive.Root>) { - return <SelectPrimitive.Root data-slot="select" {...props} /> + return <SelectPrimitive.Root data-slot="select" {...props} />; } function SelectGroup({ ...props }: React.ComponentProps<typeof SelectPrimitive.Group>) { - return <SelectPrimitive.Group data-slot="select-group" {...props} /> + return <SelectPrimitive.Group data-slot="select-group" {...props} />; } function SelectValue({ ...props }: React.ComponentProps<typeof SelectPrimitive.Value>) { - return <SelectPrimitive.Value data-slot="select-value" {...props} /> + return <SelectPrimitive.Value data-slot="select-value" {...props} />; } function SelectTrigger({ @@ -32,7 +32,7 @@ function SelectTrigger({ data-slot="select-trigger" className={cn( "border-input data-[placeholder]:text-muted-foreground aria-invalid:border-destructive ring-ring/10 dark:ring-ring/20 dark:outline-ring/40 outline-ring/50 [&_svg:not([class*='text-'])]:text-muted-foreground flex h-9 w-full items-center justify-between rounded-md border bg-transparent px-3 py-2 text-sm shadow-xs transition-[color,box-shadow] focus-visible:ring-4 focus-visible:outline-1 disabled:cursor-not-allowed disabled:opacity-50 aria-invalid:focus-visible:ring-0 *:data-[slot=select-value]:flex *:data-[slot=select-value]:items-center *:data-[slot=select-value]:gap-2 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4 [&>span]:line-clamp-1", - className + className, )} {...props} > @@ -41,7 +41,7 @@ function SelectTrigger({ <ChevronDownIcon className="size-4 opacity-50" /> </SelectPrimitive.Icon> </SelectPrimitive.Trigger> - ) + ); } function SelectContent({ @@ -58,7 +58,7 @@ function SelectContent({ "bg-popover text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 relative z-50 max-h-96 min-w-[8rem] overflow-hidden rounded-md border shadow-md", position === "popper" && "data-[side=bottom]:translate-y-1 data-[side=left]:-translate-x-1 data-[side=right]:translate-x-1 data-[side=top]:-translate-y-1", - className + className, )} position={position} {...props} @@ -68,7 +68,7 @@ function SelectContent({ className={cn( "p-1", position === "popper" && - "h-[var(--radix-select-trigger-height)] w-full min-w-[var(--radix-select-trigger-width)] scroll-my-1" + "h-[var(--radix-select-trigger-height)] w-full min-w-[var(--radix-select-trigger-width)] scroll-my-1", )} > {children} @@ -76,7 +76,7 @@ function SelectContent({ <SelectScrollDownButton /> </SelectPrimitive.Content> </SelectPrimitive.Portal> - ) + ); } function SelectLabel({ @@ -89,7 +89,7 @@ function SelectLabel({ className={cn("px-2 py-1.5 text-sm font-semibold", className)} {...props} /> - ) + ); } function SelectItem({ @@ -102,7 +102,7 @@ function SelectItem({ data-slot="select-item" className={cn( "focus:bg-accent focus:text-accent-foreground [&_svg:not([class*='text-'])]:text-muted-foreground relative flex w-full cursor-default items-center gap-2 rounded-sm py-1.5 pr-8 pl-2 text-sm outline-hidden select-none data-[disabled]:pointer-events-none data-[disabled]:opacity-50 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4 *:[span]:last:flex *:[span]:last:items-center *:[span]:last:gap-2", - className + className, )} {...props} > @@ -113,7 +113,7 @@ function SelectItem({ </span> <SelectPrimitive.ItemText>{children}</SelectPrimitive.ItemText> </SelectPrimitive.Item> - ) + ); } function SelectSeparator({ @@ -126,7 +126,7 @@ function SelectSeparator({ className={cn("bg-border pointer-events-none -mx-1 my-1 h-px", className)} {...props} /> - ) + ); } function SelectScrollUpButton({ @@ -138,13 +138,13 @@ function SelectScrollUpButton({ data-slot="select-scroll-up-button" className={cn( "flex cursor-default items-center justify-center py-1", - className + className, )} {...props} > <ChevronUpIcon className="size-4" /> </SelectPrimitive.ScrollUpButton> - ) + ); } function SelectScrollDownButton({ @@ -156,13 +156,13 @@ function SelectScrollDownButton({ data-slot="select-scroll-down-button" className={cn( "flex cursor-default items-center justify-center py-1", - className + className, )} {...props} > <ChevronDownIcon className="size-4" /> </SelectPrimitive.ScrollDownButton> - ) + ); } export { @@ -176,4 +176,4 @@ export { SelectSeparator, SelectTrigger, SelectValue, -} +}; diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts index f19c976..1e84e93 100644 --- a/src/lib/calls/nlp.ts +++ b/src/lib/calls/nlp.ts @@ -1,4 +1,5 @@ import { SyllableRes } from "../types/cards"; +import { randomFromArray } from "../utils"; export type ThaiNLPRes = { word: string; diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts index d6da389..7c067d2 100644 --- a/src/lib/db/prosodydb.ts +++ b/src/lib/db/prosodydb.ts @@ -1,5 +1,5 @@ import Database from "bun:sqlite"; -import { Phoneme, Tone } from "../types/phonetics"; +import { MutationOrder, Phoneme, Tone } from "../types/phonetics"; import { ProsodyWord, ProsodyWordDB } from "../types/cards"; type Str = string | null; type ItemType = "word" | "syllable" | "idiom"; @@ -113,6 +113,7 @@ class DatabaseHandler { w.spelling, wp.ipa, w.frequency, + GROUP_CONCAT(s.text ORDER BY sw.idx) as syl_seq, GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, COUNT(sw.syl_id) as syllable_count FROM words w @@ -127,17 +128,166 @@ class DatabaseHandler { spelling, ipa, frequency, + syl_seq, tone_sequence, syllable_count FROM word_tone_sequences WHERE tone_sequence LIKE ? AND syllable_count = ? - ORDER BY frequency DESC NULLS LAST; + ORDER BY frequency ASC NULLS LAST; `, ); return query.all(toneString.slice(1), tones.length) as any[]; } + // fetchWordsByToneAndSyls(tones: Array<string | null>) { + // const toneString = tones.reduce((acc: string, item) => { + // if (!item) return `${acc},%`; + // else return `${acc},${item}`; + // }, ""); + // console.log({ toneString }); + // const query = this.db.query( + // ` + // WITH word_tone_sequences AS ( + // SELECT + // w.id as word_id, + // w.spelling, + // wp.ipa, + // w.frequency, + // GROUP_CONCAT(s.text ORDER BY sw.idx) as syl_seq, + // GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, + // COUNT(sw.syl_id) as syllable_count + // FROM words w + // JOIN word_phonetics wp ON w.id = wp.word_id + // JOIN syllables_words sw ON w.id = sw.word_id + // JOIN syllables s ON sw.syl_id = s.id + // JOIN tones t ON s.tone = t.id + // GROUP BY w.id, w.spelling, w.lang, w.frequency + // ) + // SELECT + // word_id, + // spelling, + // ipa, + // frequency, + // syl_seq, + // tone_sequence, + // syllable_count + // FROM word_tone_sequences + // WHERE tone_sequence LIKE ? + // AND syllable_count = ? + // ORDER BY frequency DESC NULLS LAST; + // `, + // ); + // return query.all(toneString.slice(1), tones.length) as any[]; + // } + fetchWordsByToneSylsWords(order: MutationOrder) { + console.log({ order }); + type Acc = { tones: string; syls: string }; + const strings = order.reduce( + (acc: Acc, item, idx) => { + const startString = idx === 0 ? "" : ","; + if ("change" in item) + return { + tones: `${acc.tones}${startString}${item.change}`, + syls: `${acc.syls}${startString}%`, + }; + else + return { + tones: `${acc.tones}${startString}%`, + syls: `${acc.syls}${startString}${item.keep}`, + }; + }, + { tones: "", syls: "" }, + ); + const query = this.db.query(` + SELECT + w.id as word_id, + w.spelling, + w.lang, + w.frequency, + wp.ipa, + wp.syllable_sequence, + wp.tone_sequence, + wp.ipa_sequence, + GROUP_CONCAT(s.text ORDER BY sw.idx) as syllable_pattern, + GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_pattern + FROM words w + JOIN syllables_words sw ON w.id = sw.word_id + JOIN syllables s ON sw.syl_id = s.id + JOIN tones t ON s.tone = t.id + JOIN word_phonetics wp ON wp.word_id= w.id + WHERE wp.syllable_sequence LIKE ?1 + AND tone_sequence LIKE ?2 + AND syllable_count = ?3 + GROUP BY w.id, w.spelling, w.lang, w.frequency + ORDER BY w.frequency ASC NULLS LAST; `); + return query.all(strings.syls, strings.tones, order.length) as any[]; + } // inserts + superAdd(p: { + word: string; + lang: string; + frequency: number | null; + wordNotes: Str; + phonetics: Array<{ + ipa: string; + syllable_count: number; + syllable_sequence: string; + tone_sequence: string; + ipa_sequence: string; + tags: Str; + notes: Str; + wordRhyme: Str; + syllables: Array<{ + idx: number; + stressed: boolean | null; + spelling: string; + ipa: string; + long: boolean; + onset: Phoneme; + medial: Phoneme; + nucleus: Phoneme; + coda: Phoneme; + rhyme: Phoneme; + tone: Tone; + notes: Str; + }>; + }>; + }) { + const tx = this.db.transaction(() => { + const wordId = this.addWord(p.word, p.lang, p.frequency, p.wordNotes); + for (const ph of p.phonetics) { + this.addPronunciation( + wordId, + ph.ipa, + ph.syllable_count, + ph.syllable_sequence, + ph.tone_sequence, + ph.ipa_sequence, + ph.tags, + ph.notes, + ); + for (const syl of ph.syllables) { + this.addSyllable( + wordId, + syl.idx, + syl.stressed, + p.lang, + syl.ipa, + syl.long, + syl.spelling, + syl.onset, + syl.medial, + syl.nucleus, + syl.coda, + syl.rhyme, + syl.tone, + syl.notes, + ); + } + } + }); + tx(); + } addLanguage(code: string, name: string) { const query = this.db @@ -147,15 +297,44 @@ class DatabaseHandler { addPronunciation( wordId: number | bigint, ipa: string, - syllables: number, + syllable_count: number, + syllable_sequence: string, + tone_sequence: string, + ipa_sequence: string, tags: Str, notes: Str, ) { + console.log({ + wordId, + ipa, + syllable_count, + syllable_sequence, + tone_sequence, + ipa_sequence, + }); const query = this.db .query( - `INSERT OR IGNORE INTO word_phonetics(word_id,ipa, syllables, tag, notes) VALUES(?, ?, ?, ?, ?)`, + `INSERT OR IGNORE INTO word_phonetics( + word_id, + ipa, + syllable_count, + syllable_sequence, + tone_sequence, + ipa_sequence, + tag, + notes) + VALUES(?, ?, ?, ?, ?, ?, ?, ?)`, ) - .run(wordId, ipa, syllables, tags, notes); + .run( + wordId, + ipa, + syllable_count, + syllable_sequence, + tone_sequence, + ipa_sequence, + tags, + notes, + ); } addWordRhyme(wordId: number | bigint, ipa: string, lang: string, notes: Str) { const query = this.db @@ -212,12 +391,14 @@ class DatabaseHandler { notes: Str, ) { const query = this.db.query( - `INSERT OR IGNORE INTO words(spelling, lang, frequency, notes) VALUES(?, ?, ?, ?)`, - // `INSERT INTO words(spelling, lang) VALUES(?, ?)`, + `INSERT INTO words(spelling, lang, frequency, notes) VALUES(?, ?, ?, ?) + ON CONFLICT(spelling, lang) DO UPDATE SET + lang = excluded.lang + RETURNING rowid + `, ); - const res = query.run(spelling, lang, frequency, notes); - const wordId = res.lastInsertRowid; - return wordId; + const res = query.get(spelling, lang, frequency, notes) as { id: number }; + return res.id; } addSyllable( wordId: number | bigint, @@ -292,9 +473,15 @@ class DatabaseHandler { .get(tone.letters, lang, tone.name, tone.numbers) as { id: number }; const query = this.db.query( - `INSERT INTO syllables(lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + `INSERT INTO syllables( + lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes) + VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(text, ipa, lang) DO UPDATE SET + lang = excluded.lang + RETURNING rowid + `, ); - const res = query.run( + const res = query.get( lang, ipa, long, @@ -306,8 +493,8 @@ class DatabaseHandler { rhymeId.id, toneId.id, notes, - ); - const sylId = res.lastInsertRowid; + ) as { id: number }; + const sylId = res.id; // const res1 = this.db .query( diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql index c6a04fa..5554a02 100644 --- a/src/lib/db/prosodyschema.sql +++ b/src/lib/db/prosodyschema.sql @@ -150,9 +150,103 @@ CREATE TABLE IF NOT EXISTS word_phonetics( id INTEGER PRIMARY KEY AUTOINCREMENT, word_id INTEGER NOT NULL, ipa TEXT NOT NULL, - syllables INTEGER NOT NULL, + syllable_count INTEGER NOT NULL, + syllable_sequence TEXT NOT NULL, -- "家,鄉" + tone_sequence TEXT NOT NULL, -- "rising,rising" + ipa_sequence TEXT NOT NULL, -- IPA representation tag TEXT, notes TEXT, - CONSTRAINT ipa_unique UNIQUE (ipa, word_id) + FOREIGN KEY (word_id) REFERENCES words(id) ); CREATE INDEX IF NOT EXISTS idx_words_ipa ON word_phonetics(ipa, word_id); + +-- -- Query 2: Even simpler with pattern table +-- -- Pattern [{ change: "rising" }, { change: "falling" }] - any 2-syllable word with rising,falling tones +-- SELECT +-- w.spelling, +-- w.frequency, +-- wp.syllable_sequence, +-- wp.tone_sequence +-- FROM words w +-- JOIN word_patterns wp ON w.id = wp.word_id +-- WHERE wp.syllable_count = 2 +-- AND wp.tone_sequence = 'rising,falling' +-- ORDER BY w.frequency DESC NULLS LAST; + +-- -- Query 3: Mixed pattern [{ keep: "家" }, { change: "falling" }, { keep: "人" }] +-- SELECT DISTINCT +-- w.spelling, +-- w.frequency, +-- wp.syllable_sequence, +-- wp.tone_sequence +-- FROM words w +-- JOIN word_patterns wp ON w.id = wp.word_id +-- WHERE wp.syllable_count = 3 +-- AND wp.syllable_sequence LIKE '家,%,人' -- Simple pattern matching +-- AND EXISTS ( +-- SELECT 1 FROM word_syllable_positions wsp +-- WHERE wsp.word_id = w.id +-- AND wsp.position = 1 +-- AND wsp.tone_name = 'falling' +-- ) +-- ORDER BY w.frequency DESC NULLS LAST; + +-- -- Query 4: Super fast rhyme finding +-- -- Find all words that end with same syllable as "家鄉" (end with "鄉") +-- SELECT +-- w.spelling, +-- w.frequency, +-- wp.syllable_sequence +-- FROM words w +-- JOIN word_patterns wp ON w.id = wp.word_id +-- WHERE wp.syllable_sequence LIKE '%,鄉' -- Ends with 鄉 +-- AND wp.syllable_count >= 2 +-- ORDER BY w.frequency DESC NULLS LAST; + + + + +-- SELECT +-- w.id as word_id, +-- w.spelling, +-- w.lang, +-- w.frequency +-- FROM words w +-- JOIN word_phonetics wp ON wp.word_id= w.id +-- WHERE wp.syllable_sequence LIKE '%,ใจ' +-- AND wp.tone_sequence LIKE 'rising,%' +-- AND wp.syllable_count = 2 +-- GROUP BY w.id, w.spelling, w.lang, w.frequency +-- ORDER BY w.frequency DESC NULLS LAST; +-- +-- Indexes for fast pattern matching +CREATE INDEX IF NOT EXISTS idx_word_patterns_syllables ON word_phonetics(syllable_sequence); +CREATE INDEX IF NOT EXISTS idx_word_patterns_tones ON word_phonetics(tone_sequence); +CREATE INDEX IF NOT EXISTS idx_word_patterns_count ON word_phonetics(syllable_count); +CREATE INDEX IF NOT EXISTS idx_word_patterns_mixed ON word_phonetics(syllable_count, syllable_sequence, tone_sequence); + + +CREATE INDEX IF NOT EXISTS idx_syllables_words_word_idx ON syllables_words(word_id, idx); +CREATE INDEX IF NOT EXISTS idx_syllables_words_idx_word ON syllables_words(idx, word_id); +CREATE INDEX IF NOT EXISTS idx_syllables_words_syl ON syllables_words(syl_id); + +-- 2. Syllables table indexes for text and language lookups +CREATE INDEX IF NOT EXISTS idx_syllables_text_lang ON syllables(text, lang); +CREATE INDEX IF NOT EXISTS idx_syllables_lang_text ON syllables(lang, text); +CREATE INDEX IF NOT EXISTS idx_syllables_tone ON syllables(tone); +CREATE INDEX IF NOT EXISTS idx_syllables_text_tone ON syllables(text, tone); + +-- 3. Tones table indexes +CREATE INDEX IF NOT EXISTS idx_tones_name_lang ON tones(name, lang); +CREATE INDEX IF NOT EXISTS idx_tones_nums_lang ON tones(nums, lang); +CREATE INDEX IF NOT EXISTS idx_tones_lang_name ON tones(lang, name); + +-- 4. Words table indexes +CREATE INDEX IF NOT EXISTS idx_words_lang_freq ON words(lang, frequency DESC); +CREATE INDEX IF NOT EXISTS idx_words_id_lang ON words(id, lang); + +-- 5. Composite indexes for common query patterns +CREATE INDEX IF NOT EXISTS idx_syllables_compound ON syllables(lang, text, tone); +CREATE INDEX IF NOT EXISTS idx_syllables_words_compound ON syllables_words(word_id, idx, syl_id); + + diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts index 6c69d9c..32434da 100644 --- a/src/lib/db/thaiseed.ts +++ b/src/lib/db/thaiseed.ts @@ -11,7 +11,7 @@ import { import pdb from "./prosodydb"; import { cleanIpa } from "../utils"; import { handleFile } from "./utils"; -import { Tone } from "../types/phonetics"; +import { Phoneme, Tone } from "../types/phonetics"; import { AsyncRes } from "../types"; async function readDump(lang: string) { @@ -25,7 +25,7 @@ async function readDump(lang: string) { // langrows = langrows.slice(10); for (const langrow of langrows) { count++; - // console.log(count); + console.log(count); // if (count <= 10000) continue; // if (count > 100) break; const j = JSON.parse(langrow.data); @@ -68,65 +68,101 @@ async function readDump(lang: string) { async function handleWord(word: string, j: any): AsyncRes<string> { // TODO add categories but add a tag to see what classifying scheme we're using // - const sounds = j.sounds || []; - const hasIpa = sounds.find((s: any) => "ipa" in s); - if (!hasIpa) return { error: "meh no ipa" }; - const freq = await getThaiFreq(word); - const wordId = pdb.addWord(word, "th", freq, null); - if (wordId == 478 || word === "และ") { - console.log("wtf man"); - console.dir(j, { depth: null }); - return { error: "i said wtf" }; - } + const frequency = await getThaiFreq(word); const analyzed = await analyzeTHWord(word); - for (let snd of sounds) - if ("ipa" in snd) { - const res = await handleIpa(wordId, j, snd, analyzed); - if ("error" in res) return res; - } + const phonetics = await Promise.all(getIpa(j, analyzed)); + + pdb.superAdd({ word, lang: "th", frequency, wordNotes: null, phonetics }); return { ok: "" }; } -async function handleIpa( - wordId: number | bigint, - j: any, - snd: any, - analyzed: ThaiNLPRes, -): AsyncRes<string> { +function getIpa(j: any, analyzed: ThaiNLPRes) { + const sounds = j.sounds || []; + const hasIpa = sounds.find((s: any) => "ipa" in s); + if (!hasIpa) return []; + const ipaData: Promise<IPAData>[] = sounds.reduce( + async (acc: Promise<IPAData>[], snd: any) => { + if ("ipa" in snd) { + const data = getIpaData(snd, analyzed); + return [...acc, data]; + } else return acc; + }, + [], + ); + return ipaData; +} +type IPAData = { + ipa: string; + syllable_count: number; + syllable_sequence: string; + tone_sequence: string; + ipa_sequence: string; + tags: string | null; + notes: string | null; + wordRhyme: string | null; + syllables: SylData[]; +}; +async function getIpaData(snd: any, analyzed: ThaiNLPRes): Promise<IPAData> { const tags = JSON.stringify(snd.tags) || null; // console.log("handleipa", analyzed.syllables.length); // console.log(analyzed); const wikiIpa = cleanIpa(snd.ipa); const nlpIpa = cleanIpa(analyzed.ipa); const ipa = wikiIpa || nlpIpa; - if (j.word === "และ") { - console.log("wtf!!"); - return { error: "wtf is this" }; - } + // if (j.word === "และ") { + // console.log("wtf!!"); + // return { error: "wtf is this" }; + // } const wikiIpaSplit = wikiIpa.split("."); const nlpIpaSplit = nlpIpa.split("."); if (wikiIpaSplit.length !== nlpIpaSplit.length) { - // console.log("ipa mismatch"); - // console.log(wikiIpa); - // console.log(nlpIpa); + console.log("ipa mismatch"); + console.log(wikiIpa); + console.log(nlpIpa); } if (analyzed.realSyls.length !== wikiIpaSplit.length) { - // console.log("syllable analysis mismatch", j.word); - // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit }); - // console.dir(j, { depth: null }); - return { error: "meh syllable analysis mismatch" }; + console.log("syllable analysis mismatch", analyzed.word); + console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit }); + throw new Error("syllable mismatch"); } const writtenSyls = analyzed.syllables; - const pronouncedSyls = analyzed.realSyls; + const pronouncedSyls = analyzed.realSyls.map((s) => + s.replace(/\u{E3A}/u, ""), + ); + + const tone_sequence = wikiIpaSplit + .map((s) => parseTone(s, analyzed.word)) + .map((t) => t.name) + .join(","); + const syllable_sequence = pronouncedSyls.join(","); + const ipa_sequence = wikiIpaSplit.join(","); + const syllables = await Promise.all( + getSyllables(writtenSyls, pronouncedSyls, wikiIpaSplit), + ); + return { + ipa, + syllable_count: pronouncedSyls.length, + syllable_sequence, + tone_sequence, + ipa_sequence, + tags, + notes: null, + wordRhyme: null, + syllables, + }; +} +function getSyllables( + writtenSyls: string[], + pronouncedSyls: string[], + ipaSyls: string[], +) { let badSyls = false; if (writtenSyls.length !== pronouncedSyls.length) badSyls = true; - - pdb.addPronunciation(wordId, ipa, pronouncedSyls.length, tags, null); - + let syls: Promise<SylData>[] = []; for (let i = 0; i < pronouncedSyls.length; i++) { - const pronounced = pronouncedSyls[i]!.replace(/\u{E3A}/u, ""); + const pronounced = pronouncedSyls[i]!; const written = writtenSyls[i] || ""; const syllable = badSyls ? pronounced : written; - const ipa = wikiIpaSplit[i]!; + const ipa = ipaSyls[i]!; // TODO insert both?? const notes = pronounced === written ? null : `Pronounced ${pronounced}`; if (pronounced !== syllable) { @@ -134,10 +170,10 @@ async function handleIpa( console.log(pronounced); console.log(written); } - const res = await handleSyllable(syllable, ipa, wordId, i, notes); - if ("error" in res) return res; + const res = getSyllable(syllable, ipa, i, notes); + syls.push(res); } - return { ok: "" }; + return syls; } const thaiTones: Record<string, string> = { "˧": "mid", @@ -153,8 +189,22 @@ const thaiToneNums: Record<string, number> = { "˦˥": 45, "˩˩˦": 214, }; +const toneRegex = new RegExp(Object.keys(thaiToneNums).join("|")); + function parseTone(ipa: string, spelling: string): Tone { try { + const match = ipa.match(toneRegex)!; + const m = match[0]!; + const name = thaiTones[m]!; + const numbers = thaiToneNums[m]!; + return { letters: ipa, name, numbers }; + } catch (e) { + console.error("meh wrong tones!!", { s: spelling, ipa }); + throw new Error(""); + } +} +function parseToneS(ipa: string, spelling: string): Tone { + try { const name = thaiTones[ipa]!; const numbers = thaiToneNums[ipa]!; return { letters: ipa, name, numbers }; @@ -164,71 +214,44 @@ function parseTone(ipa: string, spelling: string): Tone { } } -async function handleSyllable( +type SylData = { + idx: number; + stressed: boolean | null; + spelling: string; + ipa: string; + long: boolean; + onset: Phoneme; + medial: Phoneme; + nucleus: Phoneme; + coda: Phoneme; + rhyme: Phoneme; + tone: Tone; + notes: string | null; +}; +async function getSyllable( spelling: string, ipa: string, - wordId: number | bigint, idx: number, notes: string | null, -): AsyncRes<string> { +): Promise<SylData> { const sorsyl = await sorSyl(spelling, "th", ipa); - const weird = [ - // "a̯n", - // "a̯", - // "a̯p", - // "a̯w", - // "a̯j", - // "a̯ŋ", - // "a̯k", - // "a̯t", - // "a̯m", - // "a̯ʔ", - // "ʔ", - "s", - "l", - "f", - "a̯s", - "js", - "t͡ɕʰ", - "ks", - "ns", - "a̯l", - "a̯f", - "mk", - ]; - // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda)); - // if (weirder) { - // console.log("syllable", spelling); - // // console.dir(sorsyl, { depth: null }); - // // console.dir(j, { depth: null }); - // } if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!"); const syl = sorsyl.syls[0]!.ipa; - const tone = parseTone(syl.tone, spelling); - // TODO add actual ortographic data here not just ipa - try { - pdb.addSyllable( - wordId, - idx + 1, - null, - "th", - syl.all, - syl.long, - spelling, - { spelling: syl.onset, ipa: syl.onset }, - { spelling: syl.medial, ipa: syl.medial }, - { spelling: syl.nucleus, ipa: syl.nucleus }, - { spelling: syl.coda, ipa: syl.coda }, - { spelling: syl.rhyme, ipa: syl.rhyme }, - tone, - notes, - ); - return { ok: "" }; - } catch (e) { - // console.log("well fuck", syl); - // console.error(e); - return { error: `meh ${e}` }; - } + const tone = parseToneS(syl.tone, spelling); + return { + idx: idx + 1, + stressed: null, + spelling, + ipa: syl.all, + long: syl.long, + onset: { spelling: syl.onset, ipa: syl.onset }, + medial: { spelling: syl.medial, ipa: syl.medial }, + nucleus: { spelling: syl.nucleus, ipa: syl.nucleus }, + coda: { spelling: syl.coda, ipa: syl.coda }, + rhyme: { spelling: syl.rhyme, ipa: syl.rhyme }, + tone, + notes, + }; } async function handleIdiom(idiom: string): AsyncRes<string> { pdb.addIdiom(idiom, "th"); @@ -236,33 +259,5 @@ async function handleIdiom(idiom: string): AsyncRes<string> { // console.log(); return { ok: "" }; } -// ช้า ๆ -// งก ๆ -// หงก ๆ - -async function getFrequency() { - const files = [ - "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv", - "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv", - "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv", - "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv", - "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv", - "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv", - ]; - const freqMap = new Map<number, string>(); - for (const file of files) { - await handleFile(file, (line, idx) => { - const [spelling, IPA, tone, length, frequency, ...rest] = line.split(","); - freqMap.set(Number(frequency!), spelling!); - }); - } - const orderedMap = new Map<string, number>(); - const keys = Array.from(freqMap.keys()).sort(); - for (let i = 0; i < keys.length; i++) { - const val = freqMap.get(keys[i]!)!; - orderedMap.set(val, i + 1); - } - return orderedMap; -} readDump("th"); diff --git a/src/lib/db/thaiseedold.ts b/src/lib/db/thaiseedold.ts new file mode 100644 index 0000000..b9522dd --- /dev/null +++ b/src/lib/db/thaiseedold.ts @@ -0,0 +1,301 @@ +import Database from "bun:sqlite"; +import { + analyzeTHWord, + deconstructSyllable, + segmentateThai, + type SorSyl, + type ThaiNLPRes, + sorSyl, + getThaiFreq, +} from "../calls/nlp"; +import pdb from "./prosodydb"; +import { cleanIpa } from "../utils"; +import { handleFile } from "./utils"; +import { Tone } from "../types/phonetics"; +import { AsyncRes } from "../types"; + +async function readDump(lang: string) { + await pdb.init(); + pdb.addLanguage("th", "thai"); + let count = 0; + const langdb = new Database( + `/home/y/code/prosody/resources/wiktionary/${lang}.db`, + ); + let langrows: any = langdb.query("SELECT data FROM langs"); + // langrows = langrows.slice(10); + for (const langrow of langrows) { + count++; + console.log(count); + // if (count <= 10000) continue; + // if (count > 100) break; + const j = JSON.parse(langrow.data); + const word = j.word.trim(); + if (!word) continue; + + if (word.includes("ๆ")) { + const res = await handleWord(word, j); + if ("error" in res) { + if (res.error.includes("meh")) continue; + if (res.error.includes("wtf")) { + console.error(res.error); + console.error(j.sounds); + } + break; + } + } else { + const split = word.split(" "); + if (split.length > 1) { + const res = await handleIdiom(word); + if ("error" in res) { + console.error(res.error); + break; + } + } else { + const res = await handleWord(word, j); + if ("error" in res) { + if (res.error.includes("meh")) continue; + if (res.error.includes("wtf")) { + console.error(res.error); + console.error(j.sounds); + } + // break; + } + } + } + } +} + +// if (wordId == 478 || word === "และ") { +// // console.log("wtf man"); +// // console.dir(j, { depth: null }); +// // return { error: "i said wtf" }; +// } +async function handleWord(word: string, j: any): AsyncRes<string> { + // TODO add categories but add a tag to see what classifying scheme we're using + // + const sounds = j.sounds || []; + const hasIpa = sounds.find((s: any) => "ipa" in s); + if (!hasIpa) return { error: "meh no ipa" }; + const freq = await getThaiFreq(word); + const wordId = pdb.addWord(word, "th", freq, null); + const analyzed = await analyzeTHWord(word); + for (let snd of sounds) + if ("ipa" in snd) { + const res = await handleIpa(wordId, j, snd, analyzed); + if ("error" in res) return res; + } + return { ok: "" }; +} +async function handleIpa( + wordId: number | bigint, + j: any, + snd: any, + analyzed: ThaiNLPRes, +): AsyncRes<string> { + console.log(); + const tags = JSON.stringify(snd.tags) || null; + // console.log("handleipa", analyzed.syllables.length); + // console.log(analyzed); + const wikiIpa = cleanIpa(snd.ipa); + const nlpIpa = cleanIpa(analyzed.ipa); + const ipa = wikiIpa || nlpIpa; + // if (j.word === "และ") { + // console.log("wtf!!"); + // return { error: "wtf is this" }; + // } + const wikiIpaSplit = wikiIpa.split("."); + const nlpIpaSplit = nlpIpa.split("."); + if (wikiIpaSplit.length !== nlpIpaSplit.length) { + // console.log("ipa mismatch"); + // console.log(wikiIpa); + // console.log(nlpIpa); + } + if (analyzed.realSyls.length !== wikiIpaSplit.length) { + // console.log("syllable analysis mismatch", j.word); + // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit }); + // console.dir(j, { depth: null }); + return { error: "meh syllable analysis mismatch" }; + } + const writtenSyls = analyzed.syllables; + const pronouncedSyls = analyzed.realSyls.map((s) => + s.replace(/\u{E3A}/u, ""), + ); + let badSyls = false; + if (writtenSyls.length !== pronouncedSyls.length) badSyls = true; + + const tone_sequence = wikiIpaSplit + .map((s) => parseTone(s, j.word)) + .map((t) => t.name) + .join(","); + const syl_sequence = pronouncedSyls.join(","); + const ipa_sequence = wikiIpaSplit.join(","); + pdb.addPronunciation( + wordId, + ipa, + pronouncedSyls.length, + syl_sequence, + tone_sequence, + ipa_sequence, + tags, + null, + ); + + for (let i = 0; i < pronouncedSyls.length; i++) { + const pronounced = pronouncedSyls[i]!; + const written = writtenSyls[i] || ""; + const syllable = badSyls ? pronounced : written; + const ipa = wikiIpaSplit[i]!; + // TODO insert both?? + const notes = pronounced === written ? null : `Pronounced ${pronounced}`; + if (pronounced !== syllable) { + console.log("diff"); + console.log(pronounced); + console.log(written); + } + const res = await handleSyllable(syllable, ipa, wordId, i, notes); + if ("error" in res) return res; + } + return { ok: "" }; +} +const thaiTones: Record<string, string> = { + "˧": "mid", + "˨˩": "low", + "˥˩": "falling", + "˦˥": "high", + "˩˩˦": "rising", +}; +const thaiToneNums: Record<string, number> = { + "˧": 33, + "˨˩": 21, + "˥˩": 41, + "˦˥": 45, + "˩˩˦": 214, +}; +const toneRegex = new RegExp(Object.keys(thaiToneNums).join("|")); + +function parseTone(ipa: string, spelling: string): Tone { + try { + const match = ipa.match(toneRegex)!; + const m = match[0]!; + const name = thaiTones[m]!; + const numbers = thaiToneNums[m]!; + return { letters: ipa, name, numbers }; + } catch (e) { + console.error("meh wrong tones!!", { s: spelling, ipa }); + throw new Error(""); + } +} +function parseToneS(ipa: string, spelling: string): Tone { + try { + const name = thaiTones[ipa]!; + const numbers = thaiToneNums[ipa]!; + return { letters: ipa, name, numbers }; + } catch (e) { + console.error("meh wrong tones!!", { s: spelling, ipa }); + throw new Error(""); + } +} + +async function handleSyllable( + spelling: string, + ipa: string, + wordId: number | bigint, + idx: number, + notes: string | null, +): AsyncRes<string> { + const sorsyl = await sorSyl(spelling, "th", ipa); + // console.log("ssyl", sorsyl.syls); + const weird = [ + // "a̯n", + // "a̯", + // "a̯p", + // "a̯w", + // "a̯j", + // "a̯ŋ", + // "a̯k", + // "a̯t", + // "a̯m", + // "a̯ʔ", + // "ʔ", + "s", + "l", + "f", + "a̯s", + "js", + "t͡ɕʰ", + "ks", + "ns", + "a̯l", + "a̯f", + "mk", + ]; + // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda)); + // if (weirder) { + // console.log("syllable", spelling); + // // console.dir(sorsyl, { depth: null }); + // // console.dir(j, { depth: null }); + // } + if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!"); + const syl = sorsyl.syls[0]!.ipa; + const tone = parseToneS(syl.tone, spelling); + // TODO add actual ortographic data here not just ipa + try { + pdb.addSyllable( + wordId, + idx + 1, + null, + "th", + syl.all, + syl.long, + spelling, + { spelling: syl.onset, ipa: syl.onset }, + { spelling: syl.medial, ipa: syl.medial }, + { spelling: syl.nucleus, ipa: syl.nucleus }, + { spelling: syl.coda, ipa: syl.coda }, + { spelling: syl.rhyme, ipa: syl.rhyme }, + tone, + notes, + ); + return { ok: "" }; + } catch (e) { + // console.log("well fuck", syl); + // console.error(e); + return { error: `meh ${e}` }; + } +} +async function handleIdiom(idiom: string): AsyncRes<string> { + pdb.addIdiom(idiom, "th"); + // TODO later set idiom_words once all words are populated + // console.log(); + return { ok: "" }; +} +// ช้า ๆ +// งก ๆ +// หงก ๆ + +async function getFrequency() { + const files = [ + "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv", + ]; + const freqMap = new Map<number, string>(); + for (const file of files) { + await handleFile(file, (line, idx) => { + const [spelling, IPA, tone, length, frequency, ...rest] = line.split(","); + freqMap.set(Number(frequency!), spelling!); + }); + } + const orderedMap = new Map<string, number>(); + const keys = Array.from(freqMap.keys()).sort(); + for (let i = 0; i < keys.length; i++) { + const val = freqMap.get(keys[i]!)!; + orderedMap.set(val, i + 1); + } + return orderedMap; +} + +readDump("th"); diff --git a/src/lib/types/phonetics.ts b/src/lib/types/phonetics.ts index 0009e78..f7289c7 100644 --- a/src/lib/types/phonetics.ts +++ b/src/lib/types/phonetics.ts @@ -20,3 +20,7 @@ export type Syllable = { rhyme: Phoneme; tone: Tone; }; + +export type ToneQuery = Array<string | null>; +export type MutationType = { change: string } | { keep: string }; +export type MutationOrder = MutationType[]; diff --git a/src/lib/utils.ts b/src/lib/utils.ts index 0674dea..0f0c084 100644 --- a/src/lib/utils.ts +++ b/src/lib/utils.ts @@ -63,3 +63,8 @@ export function cleanIpa(ipa: string): string { const r2 = /[\[\]\/]/g; return ipa.replace(r1, "").replace(r2, ""); } + +export function randomFromArray<T>(arr: T[]): T { + const idx = Math.floor(Math.random() * arr.length); + return arr[idx]!; +} diff --git a/src/pages/api/tts.ts b/src/pages/api/tts.ts new file mode 100644 index 0000000..bd9a697 --- /dev/null +++ b/src/pages/api/tts.ts @@ -0,0 +1,81 @@ +// import db from "../../lib/db"; +import { randomFromArray } from "@/lib/utils"; +import { z } from "zod"; + +export const GET = async (request: Request): Promise<Response> => { + const url = URL.parse(request.url)!; + const params = url?.searchParams; + const word = params.get("word")!; + const lang = params.get("lang")!; + + try { + const res = await tts(word, lang); + return res; + } catch (error) { + return Response.json({ message: "Failure" }, { status: 500 }); + } +}; +const thaiVoices = [ + [ + "s3://voice-cloning-zero-shot/4353be7d-8cd3-4452-9e0b-bc4078c240d7/original/manifest.json", + "PlayDialog", + ], + [ + "s3://voice-cloning-zero-shot/4c495e1a-1352-4187-99eb-6e5dc7d55059/original/manifest.json", + "PlayDialog", + ], + [ + "s3://voice-cloning-zero-shot/59933136-5aca-4f42-827f-d354649c62a2/original/manifest.json", + "PlayDialog", + ], + [ + "s3://voice-cloning-zero-shot/ba9eb1c9-8897-4c41-9c79-f2cb428544a8/original/manifest.json", + "PlayDialog", + ], + [ + "s3://voice-cloning-zero-shot/bb585812-1c85-4a16-90f7-09c24b6c8186/original/manifest.json", + "PlayDialog", + ], + [ + "s3://voice-cloning-zero-shot/e1357526-c162-441b-afb9-285d3d21b9b4/original/manifest.json", + "PlayDialog", + ], + [ + "s3://voice-cloning-zero-shot/edd305a3-9cd2-4dd6-873f-9efc1f73aefc/original/manifest.json", + "PlayDialog", + ], + [ + "s3://voice-cloning-zero-shot/f80c355d-1075-4d2b-a53d-bb26aa4d1453/original/manifest.json", + "PlayDialog", + ], +]; + +async function tts(text: string, language: string) { + const USER_ID = Bun.env.PLAYHT_USER_ID!; + const API_KEY = Bun.env.PLAYHT_API_KEY!; + const [voice, voice_engine] = randomFromArray(thaiVoices); + console.log("tts", text); + const url = "https://api.play.ht/api/v2/tts/stream"; + const options = { + method: "POST", + headers: { + accept: "*/*", + "content-type": "application/json", + "X-USER-ID": USER_ID, + AUTHORIZATION: API_KEY, + }, + body: JSON.stringify({ + text, + voice, + // wav, mp3, ogg, flac, mulaw + output_format: "wav", + quality: "high", + voice_engine, + language, + temperature: 0.7, + }), + }; + + const res = await fetch(url, options); + return res; +} diff --git a/src/pages/tones.tsx b/src/pages/tones.tsx index 96ed56c..732ebd1 100644 --- a/src/pages/tones.tsx +++ b/src/pages/tones.tsx @@ -12,9 +12,9 @@ export const getConfig = async () => { // Function to fetch the initial word on the server async function InitialWordLoader() { // Fetch a random 1-syllable Thai word with any tone initially - const initialWord = await fetchWordsByToneAndSyllables(["rising", "mid"]); - console.log({ initialWord }); - return <ToneSelectorClient initialWord={initialWord} />; + const tones = ["falling", "falling"]; + const initialWords = await fetchWordsByToneAndSyllables(tones); + return <ToneSelectorClient initialData={initialWords} initialTones={tones} />; } // Loading fallback component |