#!/usr/bin/env bash
# HTML 기반 도서관 웹사이트 분석 도구
# SSR HTML 사이트(wkcms, kolaseek, jnet 등) — 세션 쿠키·폼 구조 분석에 특화
# SPA/JSON API 분석은 scripts/analyze-library.sh 사용

# Strict mode: abort on command failure, unset variables, and failed pipeline
# stages.
set -euo pipefail

# The first positional argument selects the subcommand; default to "help".
cmd="${1:-help}"
# Drop the subcommand so "$@" holds only its arguments. Nothing to drop when
# the script was started without any argument.
if (( $# > 0 )); then
  shift
fi

# Scratch cookie jar used by the session-based subcommands; removed on exit.
COOKIE_FILE="$(mktemp)"
trap "rm -f '$COOKIE_FILE'" EXIT

#######################################
# Print the help text (command summary followed by example invocations).
# Arguments: none
# Outputs:   writes the help text to stdout; callers redirect it as needed
#######################################
_usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'USAGE_TEXT'
사용법: bash scripts/analyze-html-library.sh <명령> [인수...]

명령:
  session-init  URL                            세션 쿠키(JSESSIONID) 획득 및 출력
  session-search INIT_URL SEARCH_URL           세션 초기화 후 검색 요약 출력
  list-codes    URL [INPUT_NAME]               HTML 폼에서 도서관 코드 추출 (기본: manage_code)
  form-info     URL                            폼 action, hidden 입력값, 체크박스 name 목록
  html-context  URL PATTERN [N]               HTML에서 패턴 주변 컨텍스트 출력 (기본 N=300자)
  perf-test     INIT_URL SEARCH_TPL VALUES..   display 값별 응답 시간 측정 (세션 필요)
  wkcms-codes   HOST                           wkcms 플랫폼 도서관 코드 자동 수집 (포트 9080)
  ssl-check     HOST[:PORT]                    SSL 인증서 체인 확인

예시:
  bash scripts/analyze-html-library.sh wkcms-codes "lib.geoje.go.kr"

  bash scripts/analyze-html-library.sh session-init \
    "https://lib.geoje.go.kr:9080/wkcms/KBookSearch/BookSearchPage/MA"

  bash scripts/analyze-html-library.sh session-search \
    "https://lib.geoje.go.kr:9080/wkcms/KBookSearch/BookSearchPage/MA" \
    "https://lib.geoje.go.kr:9080/wkcms/KBookSearch/BookNomalSearch/MA?search_txt=별&book_type=BOOK&pageno=1&display=20&detail_search_type=Nomal&manage_code=MA&option=nomal&libcode=ALL&input_search_text=별&real_search_text=별&now_search_txt=별&hidden_book_type=BOOK&orderby=ASC&orderby_item=TITLE_INFO_SORT"

  bash scripts/analyze-html-library.sh list-codes \
    "https://example.library.go.kr/search.do" searchLibraryArr

  bash scripts/analyze-html-library.sh form-info \
    "https://example.library.go.kr/searchResultList.do"

  bash scripts/analyze-html-library.sh html-context \
    "https://lib.geoje.go.kr:9080/wkcms/KBookSearch/BookNomalSearch/MA?..." "ul.book_info"

  bash scripts/analyze-html-library.sh perf-test \
    "https://HOST:9080/wkcms/KBookSearch/BookSearchPage/MA" \
    "https://HOST:9080/wkcms/KBookSearch/BookNomalSearch/MA?search_txt=별&display=__DISPLAY__&book_type=BOOK&pageno=1&detail_search_type=Nomal&manage_code=MA&option=nomal&libcode=ALL&input_search_text=별&real_search_text=별&now_search_txt=별&hidden_book_type=BOOK&orderby=ASC&orderby_item=TITLE_INFO_SORT" \
    10 20 50 100 200

  bash scripts/analyze-html-library.sh ssl-check "lib.geoje.go.kr:9080"

USAGE_TEXT
}

case "$cmd" in

  session-init)
    # Request URL once with an empty cookie jar, then print the HTTP status
    # and every cookie the server set.
    #   $1 - URL to request (required)
    url="${1:?오류: URL 필요}"
    echo "[세션 초기화: $url]"
    status=$(curl -sk -o /dev/null -w "%{http_code}" -c "$COOKIE_FILE" "$url")
    echo "HTTP 상태: $status"
    echo ""
    echo "[획득한 쿠키]"
    # curl stores HttpOnly cookies as lines prefixed with "#HttpOnly_", so
    # filtering out every "#" line would hide them (session cookies are often
    # HttpOnly). Keep those lines plus every other non-empty, non-comment line.
    grep -E '^(#HttpOnly_|[^#])' "$COOKIE_FILE" || echo "(쿠키 없음)"
    ;;

  session-search)
    # Initialise a server session, run one search with the session cookies,
    # and print a short summary: total hit count, response size, and up to
    # three sample titles.
    #   $1 - INIT_URL: page requested first to obtain session cookies
    #   $2 - SEARCH_URL: search request sent with those cookies
    init_url="${1:?오류: INIT_URL 필요}"
    search_url="${2:?오류: SEARCH_URL 필요}"
    tmpfile=$(mktemp)
    # This replaces the script-level EXIT trap, so the cookie jar is listed too.
    trap "rm -f '$tmpfile' '$COOKIE_FILE'" EXIT
    echo "[세션 초기화: $init_url]"
    curl -sk -o /dev/null -c "$COOKIE_FILE" "$init_url"
    echo "[검색 요청: $search_url]"
    curl -sk -b "$COOKIE_FILE" "$search_url" > "$tmpfile"
    python3 - "$tmpfile" <<'PYEOF'
import sys, re

# Read raw bytes and decode explicitly: relying on the locale's default
# encoding crashes on pages that are not valid in it. Try UTF-8 first, then
# fall back to CP949 (a superset of EUC-KR), replacing undecodable bytes.
with open(sys.argv[1], 'rb') as f:
    raw = f.read()
try:
    html = raw.decode('utf-8')
except UnicodeDecodeError:
    html = raw.decode('cp949', errors='replace')

total_match = re.search(r'총\s*([\d,]+)\s*건', html)
print('총 건수:', total_match.group(1) if total_match else '확인 안 됨')
# Report the real byte count, not the number of decoded characters.
print('응답 크기:', f'{len(raw):,}바이트')
# Title samples: try <h4> (wkcms markup) first, then <dt class="tit"><a>.
titles = re.findall(r'<h4[^>]*>\s*(.*?)\s*</h4>', html, re.DOTALL)[:3]
if not titles:
    titles = re.findall(r'<dt[^>]*class="tit"[^>]*>.*?<a[^>]*>(.*?)</a>', html, re.DOTALL)[:3]
if titles:
    print('\n[책 제목 샘플]')
    for t in titles:
        # Strip nested tags and cap each title at 80 characters.
        print(' -', re.sub(r'<[^>]+>', '', t).strip()[:80])
else:
    print('\n책 제목을 찾지 못했습니다. html-context 명령으로 구조 확인 권장')
PYEOF
    ;;

  list-codes)
    # Download a page and list the library codes offered by the form control
    # named INPUT_NAME (checkbox/radio inputs and <select> options).
    #   $1 - URL of the page containing the search form (required)
    #   $2 - name attribute of the control (default: manage_code)
    url="${1:?오류: URL 필요}"
    input_name="${2:-manage_code}"
    tmpfile=$(mktemp)
    # This replaces the script-level EXIT trap, so the cookie jar is listed too.
    trap "rm -f '$tmpfile' '$COOKIE_FILE'" EXIT
    echo "[도서관 코드 추출: $url]"
    echo "[파라미터 name: $input_name]"
    echo ""
    curl -skL "$url" > "$tmpfile"
    python3 - "$input_name" "$tmpfile" <<'PYEOF'
import sys, re
input_name = sys.argv[1]

# Read raw bytes and decode explicitly: relying on the locale's default
# encoding crashes on pages that are not valid in it. Try UTF-8 first, then
# fall back to CP949 (a superset of EUC-KR), replacing undecodable bytes.
with open(sys.argv[2], 'rb') as f:
    raw = f.read()
try:
    html = raw.decode('utf-8')
except UnicodeDecodeError:
    html = raw.decode('cp949', errors='replace')

# Report the real byte count, not the number of decoded characters.
print(f'[응답 크기: {len(raw):,}바이트]')

# Method 1: checkbox or radio inputs. The name attribute may come before or
# after value; values made of ASCII letters and digits are accepted.
p1 = rf"name=['\"]?{re.escape(input_name)}['\"]?[^>]+value=['\"]?([A-Za-z0-9]+)['\"]?"
p2 = rf"value=['\"]?([A-Za-z0-9]+)['\"]?[^>]+name=['\"]?{re.escape(input_name)}['\"]?"
codes = re.findall(p1, html) + re.findall(p2, html)

# Method 2: <option> values of a <select> with the same name. These are added
# to the method 1 results rather than replacing them, and both quote styles
# around the value are accepted.
sel_pat = rf'(?s)<select[^>]+name=["\']?{re.escape(input_name)}["\']?.*?</select>'
sel = re.search(sel_pat, html)
if sel:
    codes.extend(re.findall(r'<option[^>]+value=["\']([^"\']+)["\']', sel.group()))

# De-duplicate in first-seen order and drop the "all"/placeholder values.
unique = [c for c in dict.fromkeys(codes) if c not in ('ALL', '', '0')]
if unique:
    print(f'{len(unique)}개 코드 발견:')
    for c in unique:
        print(f'  {c}')
else:
    print(f'"{input_name}" 파라미터를 찾을 수 없습니다.')
    # Hint: show the name attributes that do exist on the page.
    names = list(dict.fromkeys(re.findall(r'name=["\']([^"\']+)["\']', html)))
    print('힌트 — 페이지의 파라미터 name 목록:', names[:20])
    if len(html) < 500:
        print('[응답 내용 (짧음)]')
        print(html[:500])
PYEOF
    ;;

  form-info)
    # Download a page and summarise its first five forms: action, method,
    # hidden inputs (name = value) and checkbox names.
    #   $1 - URL of the page to inspect (required)
    url="${1:?오류: URL 필요}"
    tmpfile=$(mktemp)
    # This replaces the script-level EXIT trap, so the cookie jar is listed too.
    trap "rm -f '$tmpfile' '$COOKIE_FILE'" EXIT
    echo "[폼 정보 추출: $url]"
    echo ""
    curl -skL "$url" > "$tmpfile"
    python3 - "$tmpfile" <<'PYEOF'
import sys, re

# Read raw bytes and decode explicitly: relying on the locale's default
# encoding crashes on pages that are not valid in it. Try UTF-8 first, then
# fall back to CP949 (a superset of EUC-KR), replacing undecodable bytes.
with open(sys.argv[1], 'rb') as f:
    raw = f.read()
try:
    html = raw.decode('utf-8')
except UnicodeDecodeError:
    html = raw.decode('cp949', errors='replace')

# Tag names are case-insensitive in HTML, so match <FORM> as well.
forms = re.findall(r'(?is)<form\b[^>]*>.*?</form>', html)
print(f'총 {len(forms)}개 폼')
print()
for i, form in enumerate(forms[:5]):
    # Read action/method from the opening <form ...> tag only. Searching the
    # whole form would also match attributes of nested elements, such as
    # formaction= on a button.
    open_tag = re.match(r'(?i)<form\b[^>]*>', form).group()
    action = re.search(r'(?<![\w-])action=["\']([^"\']+)["\']', open_tag, re.I)
    method = re.search(r'(?<![\w-])method=["\']([^"\']+)["\']', open_tag, re.I)
    print(f'--- 폼 {i+1} ---')
    print(f'  action : {action.group(1) if action else "(없음)"}')
    print(f'  method : {method.group(1).upper() if method else "GET"}')
    # "[^>]*" after the type attribute also matches inputs whose type is the
    # last attribute in the tag.
    hiddens = re.findall(r'<input[^>]+type=["\']hidden["\'][^>]*>', form, re.I)
    if hiddens:
        print(f'  hidden ({len(hiddens)}개):')
        for h in hiddens[:10]:
            n = re.search(r'name=["\']([^"\']+)["\']', h)
            v = re.search(r'value=["\']([^"\']*)["\']', h)
            if n:
                print(f'    {n.group(1)} = {v.group(1) if v else ""}')
    checkboxes = re.findall(r'<input[^>]+type=["\']checkbox["\'][^>]*>', form, re.I)
    if checkboxes:
        # Unique checkbox names in first-seen order.
        cb_names = list(dict.fromkeys(
            re.search(r'name=["\']([^"\']+)["\']', c).group(1)
            for c in checkboxes if re.search(r'name=["\']([^"\']+)["\']', c)
        ))
        print(f'  checkbox names : {cb_names[:8]}')
    print()
PYEOF
    ;;

  html-context)
    # Download a page and print up to three excerpts showing the literal text
    # PATTERN with up to N characters of context on each side.
    #   $1 - URL to download (required)
    #   $2 - PATTERN: literal text to look for (it is regex-escaped)
    #   $3 - N: context characters on each side (default: 300)
    url="${1:?오류: URL 필요}"
    pattern="${2:?오류: PATTERN 필요}"
    n="${3:-300}"
    # Reject a non-numeric N here instead of failing with a Python traceback.
    if [[ ! "$n" =~ ^[0-9]+$ ]]; then
      echo "오류: N은 0 이상의 정수여야 합니다: $n" >&2
      exit 1
    fi
    tmpfile=$(mktemp)
    # This replaces the script-level EXIT trap, so the cookie jar must be
    # listed here as well or it is left behind on disk.
    trap "rm -f '$tmpfile' '$COOKIE_FILE'" EXIT
    curl -sk "$url" > "$tmpfile"
    python3 - "$tmpfile" "$pattern" "$n" <<'PYEOF'
import sys, re
path, pattern, n = sys.argv[1], sys.argv[2], int(sys.argv[3])

# Read raw bytes and decode explicitly: relying on the locale's default
# encoding crashes on pages that are not valid in it. Try UTF-8 first, then
# fall back to CP949 (a superset of EUC-KR), replacing undecodable bytes.
with open(path, 'rb') as f:
    raw = f.read()
try:
    content = raw.decode('utf-8')
except UnicodeDecodeError:
    content = raw.decode('cp949', errors='replace')

escaped = re.escape(pattern)
# DOTALL lets the surrounding context span line breaks.
matches = re.findall(fr'.{{0,{n}}}{escaped}.{{0,{n}}}', content, re.DOTALL)
if not matches:
    print(f'"{pattern}" 주변 컨텍스트를 찾을 수 없습니다.')
    # Report the real byte count, not the number of decoded characters.
    print(f'응답 크기: {len(raw):,}바이트')
    sys.exit(0)
for i, m in enumerate(matches[:3]):
    print(f'--- match {i+1}/{min(len(matches),3)} ---')
    # Print the whole excerpt: its length is already bounded by N, and a
    # fixed cut-off would drop context (or the match itself) for large N.
    print(m)
    print()
PYEOF
    ;;

  perf-test)
    # Measure the search response time once per display value. The session is
    # re-initialised before every measurement.
    #   $1   - INIT_URL: page requested to obtain session cookies
    #   $2   - SEARCH_TPL: search URL containing the __DISPLAY__ placeholder
    #   $3.. - display values substituted into the template, one run each
    init_url="${1:?오류: INIT_URL 필요}"
    search_tpl="${2:?오류: SEARCH_TPL (display=__DISPLAY__ 포함) 필요}"
    shift 2
    if (( $# == 0 )); then
      {
        echo "오류: 테스트할 display 값 목록이 필요합니다"
        echo "예시: bash scripts/analyze-html-library.sh perf-test INIT_URL TPL 10 20 50 100 200"
      } >&2
      exit 1
    fi
    # Header; the template is shortened to its first 80 characters.
    printf '%s\n' \
      "[성능 테스트]" \
      "세션 초기화: $init_url" \
      "검색 템플릿: ${search_tpl:0:80}..." \
      ""
    for display in "$@"; do
      curl -sk -o /dev/null -c "$COOKIE_FILE" "$init_url"
      printf "display=%-6s " "${display}:"
      # Only the first __DISPLAY__ occurrence is substituted.
      curl -sk -b "$COOKIE_FILE" -o /dev/null \
        -w "HTTP %{http_code}  %{time_total}s\n" \
        "${search_tpl/__DISPLAY__/$display}"
    done
    ;;

  wkcms-codes)
    # Collect library codes from the search page of a wkcms site on port 9080
    # and print them, followed by a libraryList template.
    #   $1 - HOST: host name without scheme (a trailing "/" is removed)
    host="${1:?오류: HOST 필요}"
    host="${host%/}"
    wkcms_url="https://$host:9080/wkcms/KBookSearch/BookSearchPage/MA"
    tmpfile=$(mktemp)
    # This replaces the script-level EXIT trap, so the cookie jar is listed too.
    trap "rm -f '$tmpfile' '$COOKIE_FILE'" EXIT
    echo "[wkcms 도서관 코드 수집: $host]"
    echo "URL: $wkcms_url"
    echo ""
    # The first request only stores the session cookies; the second one sends
    # them and saves the page.
    curl -sk -c "$COOKIE_FILE" "$wkcms_url" -o /dev/null
    curl -sk -b "$COOKIE_FILE" "$wkcms_url" > "$tmpfile"
    python3 - "$host" "$tmpfile" <<'PYEOF'
import sys, re
host = sys.argv[1]

# Read raw bytes and decode explicitly: relying on the locale's default
# encoding crashes on pages that are not valid in it. Try UTF-8 first, then
# fall back to CP949 (a superset of EUC-KR), replacing undecodable bytes.
with open(sys.argv[2], 'rb') as f:
    raw = f.read()
try:
    html = raw.decode('utf-8')
except UnicodeDecodeError:
    html = raw.decode('cp949', errors='replace')

# Extract manage_code inputs regardless of attribute order: scan every
# <input> tag and keep upper-case values from tags that mention manage_code.
inputs = re.findall(r'<input[^>]+>', html, re.I)
codes = []
for inp in inputs:
    if 'manage_code' in inp.lower():
        val = re.search(r'value=["\']([A-Z]+)["\']', inp)
        if val and val.group(1) not in ('ALL', ''):
            codes.append(val.group(1))

# De-duplicate in first-seen order.
unique = list(dict.fromkeys(codes))
if unique:
    print(f'총 {len(unique)}개 도서관 코드:')
    for c in unique:
        print(f'  {c}')
    print()
    print('[libraryList 템플릿]')
    print('const libraryList: LibraryInfo[] = [')
    for c in unique:
        print(f'  {{ code: "{c}", name: "TODO" }},')
    print('];')
else:
    print('코드를 찾지 못했습니다.')
    # Report the real byte count, not the number of decoded characters.
    print(f'응답 크기: {len(raw):,}바이트')
    if len(html) < 1000:
        print('[응답 내용]')
        print(html[:500])
    else:
        names = list(dict.fromkeys(re.findall(r'name=["\']([^"\']+)["\']', html)))
        print('발견된 name 속성 목록:', names[:20])
        print()
        print('힌트: wkcms 플랫폼이 아닐 수 있습니다.')
        print('      포트 9080이 맞는지, /wkcms/ 경로가 맞는지 확인하세요.')
PYEOF
    ;;

  ssl-check)
    # Open a TLS connection with openssl and print the first 20 lines of its
    # report.
    #   $1 - HOST or HOST:PORT (port defaults to 443)
    host="${1:?오류: HOST[:PORT] 필요}"
    if [[ "$host" == *:* ]]; then
      # Split at the last colon.
      hostname="${host%:*}"
      port="${host##*:}"
    else
      hostname="$host"
      port="443"
    fi
    echo "[SSL 인증서 체인 확인: $hostname:$port]"
    # stdin comes from /dev/null so s_client exits after the handshake instead
    # of waiting for interactive input. sed reads the whole stream, unlike
    # head, so openssl is never killed by SIGPIPE and pipefail only reports
    # real openssl failures.
    openssl s_client -connect "$hostname:$port" -servername "$hostname" \
      </dev/null 2>&1 \
      | sed -n '1,20p'
    ;;

  help | --help | -h)
    # Explicit help request: usage goes to stdout.
    _usage
    ;;

  *)
    # Unknown subcommand: report it and show usage, both on stderr, then
    # exit with status 1.
    {
      echo "알 수 없는 명령: $cmd"
      _usage
    } >&2
    exit 1
    ;;
esac
