diff --git a/internal/httpapi/server.go b/internal/httpapi/server.go
index a9d3d75..9daf4ec 100644
--- a/internal/httpapi/server.go
+++ b/internal/httpapi/server.go
@@ -16,6 +16,7 @@ import (
"sojuboy/internal/store"
"sojuboy/internal/summarizer"
+ xhtml "golang.org/x/net/html"
)
type Metrics struct {
@@ -217,45 +218,37 @@ func (s *Server) handleLinkCard(w http.ResponseWriter, r *http.Request) {
_ = json.NewEncoder(w).Encode(s.cardCache[raw])
return
}
- // fetch minimal HTML and extract tags (very lightweight, no full readability here)
- // For brevity, we only parse a few tags by string search to keep dependencies minimal in this step
+ // fetch minimal HTML and extract tags using a tolerant HTML parser
client := &http.Client{ Timeout: 10 * time.Second }
req, _ := http.NewRequestWithContext(r.Context(), http.MethodGet, raw, nil)
resp, err := client.Do(req)
if err != nil { w.WriteHeader(http.StatusBadGateway); _, _ = w.Write([]byte("fetch error")); return }
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 { w.WriteHeader(http.StatusBadGateway); _, _ = w.Write([]byte("bad status")); return }
- // limit to 256KB
+ // limit to 256KB; io.LimitReader ends the stream with a clean EOF, so oversized pages still parse (truncated)
- limited := http.MaxBytesReader(w, resp.Body, 262144)
+ limited := io.LimitReader(resp.Body, 262144)
- b, _ := io.ReadAll(limited)
- html := string(b)
- // naive meta parsing
- get := func(names ...string) string {
- for _, n := range names {
- // look for content="..."
- idx := strings.Index(strings.ToLower(html), strings.ToLower(n))
- if idx >= 0 {
- // slice forward
- sfx := html[idx:]
- ic := strings.Index(strings.ToLower(sfx), "content=")
- if ic >= 0 {
- sfx = sfx[ic+8:]
- // trim quotes
- if len(sfx) > 0 && (sfx[0] == '"' || sfx[0] == '\'') {
- q := sfx[0]
- sfx = sfx[1:]
- iq := strings.IndexByte(sfx, q)
- if iq >= 0 { return strings.TrimSpace(sfx[:iq]) }
- }
- }
+ doc, err := xhtml.Parse(limited)
+ if err != nil { w.WriteHeader(http.StatusBadGateway); _, _ = w.Write([]byte("parse error")); return }
+ var title, desc, img string
+ var walker func(*xhtml.Node)
+ // walker visits the node tree depth-first; each field keeps the first non-empty value found in document order
+ walker = func(n *xhtml.Node) {
+ if n.Type == xhtml.ElementNode && strings.EqualFold(n.Data, "meta") {
+ // property or name + content
+ var pn = ""; var nm = ""; var content = ""
+ for _, a := range n.Attr { if strings.EqualFold(a.Key, "property") { pn = a.Val } else if strings.EqualFold(a.Key, "name") { nm = a.Val } else if strings.EqualFold(a.Key, "content") { content = a.Val } }
+ key := strings.ToLower(pn)
+ if key == "" { key = strings.ToLower(nm) }
+ switch key {
+ case "og:title", "twitter:title": if title == "" { title = content }
+ case "og:description", "twitter:description": if desc == "" { desc = content }
+ case "og:image", "twitter:image": if img == "" { img = content }
}
}
- return ""
+ for c := n.FirstChild; c != nil; c = c.NextSibling { walker(c) }
}
- card := linkCard{ URL: raw }
- card.Title = get("property=\"og:title\"","name=\"og:title\"","name=\"twitter:title\"")
- card.Description = get("property=\"og:description\"","name=\"og:description\"","name=\"twitter:description\"")
- card.Image = get("property=\"og:image\"","name=\"og:image\"","name=\"twitter:image\"")
+ walker(doc)
+ card := linkCard{ URL: raw, Title: strings.TrimSpace(title), Description: strings.TrimSpace(desc), Image: strings.TrimSpace(img) }
// cache for 24h
s.cardCache[raw] = card
s.cardCacheExp[raw] = time.Now().Add(24 * time.Hour)
@@ -393,7 +386,7 @@ func (s *Server) handleUI(w http.ResponseWriter, r *http.Request) {