// parse.go
  1. package main
  2. import (
  3. "errors"
  4. "fmt"
  5. "log"
  6. "net/http"
  7. "net/url"
  8. "regexp"
  9. "strconv"
  10. "strings"
  11. "time"
  12. "github.com/ChimeraCoder/anaconda"
  13. "github.com/Jeffail/gabs"
  14. "github.com/PuerkitoBio/goquery"
  15. "github.com/bwmarrin/discordgo"
  16. "github.com/fatih/color"
  17. "golang.org/x/net/html"
  18. )
const (
	// imgurClientID is the anonymous imgur API client ID used for album lookups.
	imgurClientID = "08af502a9e70d65"
	// sneakyUserAgent impersonates a desktop Chrome browser for sites that
	// reject unknown user agents.
	sneakyUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
)
var (
	// twitterClient is the authenticated Twitter API client; nil until
	// credentials are configured elsewhere (checked before use).
	twitterClient *anaconda.TwitterApi
)
  26. //#region Twitter
  27. func getTwitterUrls(inputURL string) (map[string]string, error) {
  28. parts := strings.Split(inputURL, ":")
  29. if len(parts) < 2 {
  30. return nil, errors.New("unable to parse Twitter URL")
  31. }
  32. return map[string]string{"https:" + parts[1] + ":orig": filenameFromURL(parts[1])}, nil
  33. }
  34. func getTwitterStatusUrls(inputURL string, m *discordgo.Message) (map[string]string, error) {
  35. if twitterClient == nil {
  36. return nil, errors.New("invalid Twitter API Keys Set")
  37. }
  38. if strings.Contains(inputURL, "/photo/") {
  39. inputURL = inputURL[:strings.Index(inputURL, "/photo/")]
  40. }
  41. if strings.Contains(inputURL, "/video/") {
  42. inputURL = inputURL[:strings.Index(inputURL, "/video/")]
  43. }
  44. matches := regexUrlTwitterStatus.FindStringSubmatch(inputURL)
  45. statusId, err := strconv.ParseInt(matches[4], 10, 64)
  46. if err != nil {
  47. return nil, err
  48. }
  49. tweet, err := twitterClient.GetTweet(statusId, nil)
  50. if err != nil {
  51. return nil, err
  52. }
  53. links := make(map[string]string)
  54. for _, tweetMedia := range tweet.ExtendedEntities.Media {
  55. if len(tweetMedia.VideoInfo.Variants) > 0 {
  56. var lastVideoVariant anaconda.Variant
  57. for _, videoVariant := range tweetMedia.VideoInfo.Variants {
  58. if videoVariant.Bitrate >= lastVideoVariant.Bitrate {
  59. lastVideoVariant = videoVariant
  60. }
  61. }
  62. if lastVideoVariant.Url != "" {
  63. links[lastVideoVariant.Url] = ""
  64. }
  65. } else {
  66. foundUrls := getDownloadLinks(tweetMedia.Media_url_https, m)
  67. for foundUrlKey, foundUrlValue := range foundUrls {
  68. links[foundUrlKey] = foundUrlValue
  69. }
  70. }
  71. }
  72. for _, tweetUrl := range tweet.Entities.Urls {
  73. foundUrls := getDownloadLinks(tweetUrl.Expanded_url, m)
  74. for foundUrlKey, foundUrlValue := range foundUrls {
  75. links[foundUrlKey] = foundUrlValue
  76. }
  77. }
  78. return links, nil
  79. }
  80. //#endregion
  81. //#region Instagram
  82. func getInstagramUrls(url string) (map[string]string, error) {
  83. username, shortcode := getInstagramInfo(url)
  84. filename := fmt.Sprintf("instagram %s - %s", username, shortcode)
  85. // if instagram video
  86. videoUrl := getInstagramVideoUrl(url)
  87. if videoUrl != "" {
  88. return map[string]string{videoUrl: filename + filepathExtension(videoUrl)}, nil
  89. }
  90. // if instagram album
  91. albumUrls := getInstagramAlbumUrls(url)
  92. if len(albumUrls) > 0 {
  93. links := make(map[string]string)
  94. for i, albumUrl := range albumUrls {
  95. links[albumUrl] = filename + " " + strconv.Itoa(i+1) + filepathExtension(albumUrl)
  96. }
  97. return links, nil
  98. }
  99. // if instagram picture
  100. afterLastSlash := strings.LastIndex(url, "/")
  101. mediaUrl := url[:afterLastSlash]
  102. mediaUrl += strings.Replace(strings.Replace(url[afterLastSlash:], "?", "&", -1), "/", "/media/?size=l", -1)
  103. return map[string]string{mediaUrl: filename + ".jpg"}, nil
  104. }
  105. func getInstagramInfo(url string) (string, string) {
  106. resp, err := http.Get(url)
  107. if err != nil {
  108. return "unknown", "unknown"
  109. }
  110. defer resp.Body.Close()
  111. z := html.NewTokenizer(resp.Body)
  112. ParseLoop:
  113. for {
  114. tt := z.Next()
  115. switch {
  116. case tt == html.ErrorToken:
  117. break ParseLoop
  118. }
  119. if tt == html.StartTagToken || tt == html.SelfClosingTagToken {
  120. t := z.Token()
  121. for _, a := range t.Attr {
  122. if a.Key == "type" {
  123. if a.Val == "text/javascript" {
  124. z.Next()
  125. content := string(z.Text())
  126. if strings.Contains(content, "window._sharedData = ") {
  127. content = strings.Replace(content, "window._sharedData = ", "", 1)
  128. content = content[:len(content)-1]
  129. jsonParsed, err := gabs.ParseJSON([]byte(content))
  130. if err != nil {
  131. log.Println(lg("API", "Instagram", color.HiRedString, "error parsing instagram json:\t"+err.Error()))
  132. continue ParseLoop
  133. }
  134. entryChildren, err := jsonParsed.Path("entry_data.PostPage").Children()
  135. if err != nil {
  136. log.Println(lg("API", "Instagram", color.HiRedString, "unable to find entries children:\t"+err.Error()))
  137. continue ParseLoop
  138. }
  139. for _, entryChild := range entryChildren {
  140. shortcode := entryChild.Path("graphql.shortcode_media.shortcode").Data().(string)
  141. username := entryChild.Path("graphql.shortcode_media.owner.username").Data().(string)
  142. return username, shortcode
  143. }
  144. }
  145. }
  146. }
  147. }
  148. }
  149. }
  150. return "unknown", "unknown"
  151. }
// getInstagramVideoUrl fetches the Instagram post page and scans its HTML
// for an OpenGraph video meta tag (og:video / og:video:secure_url),
// returning that tag's content URL. Returns "" on any fetch failure or
// when the post has no video.
func getInstagramVideoUrl(url string) string {
	resp, err := http.Get(url)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()
	z := html.NewTokenizer(resp.Body)
	for {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			// ErrorToken covers both EOF and malformed HTML; either way
			// no video tag was found.
			return ""
		}
		if tt == html.StartTagToken || tt == html.SelfClosingTagToken {
			t := z.Token()
			if t.Data == "meta" {
				for _, a := range t.Attr {
					if a.Key == "property" {
						if a.Val == "og:video" || a.Val == "og:video:secure_url" {
							// Matching property found; the URL itself lives
							// in the sibling "content" attribute.
							for _, at := range t.Attr {
								if at.Key == "content" {
									return at.Val
								}
							}
						}
					}
				}
			}
		}
	}
}
// getInstagramAlbumUrls fetches an Instagram post page and collects the
// display URLs of every image in a sidecar (multi-image) album. Returns an
// empty slice for non-album posts or on any fetch/parse failure.
// NOTE(review): depends on the legacy "window._sharedData" script blob —
// confirm Instagram still serves it.
func getInstagramAlbumUrls(url string) []string {
	var links []string
	resp, err := http.Get(url)
	if err != nil {
		return links
	}
	defer resp.Body.Close()
	z := html.NewTokenizer(resp.Body)
ParseLoop:
	for {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			// EOF or malformed HTML — stop scanning.
			break ParseLoop
		}
		if tt == html.StartTagToken || tt == html.SelfClosingTagToken {
			t := z.Token()
			for _, a := range t.Attr {
				if a.Key == "type" {
					if a.Val == "text/javascript" {
						// Script tag: inspect its text for the shared-data JSON.
						z.Next()
						content := string(z.Text())
						if strings.Contains(content, "window._sharedData = ") {
							content = strings.Replace(content, "window._sharedData = ", "", 1)
							content = content[:len(content)-1] // drop trailing ";"
							jsonParsed, err := gabs.ParseJSON([]byte(content))
							if err != nil {
								log.Println(lg("API", "Instagram", color.HiRedString, "error parsing instagram json:\t%s", err))
								continue ParseLoop
							}
							entryChildren, err := jsonParsed.Path("entry_data.PostPage").Children()
							if err != nil {
								log.Println("Unable to find entries children: ", err)
								continue ParseLoop
							}
							for _, entryChild := range entryChildren {
								// Sidecar edges exist only for album posts.
								albumChildren, err := entryChild.Path("graphql.shortcode_media.edge_sidecar_to_children.edges").Children()
								if err != nil {
									continue ParseLoop
								}
								for _, albumChild := range albumChildren {
									link, ok := albumChild.Path("node.display_url").Data().(string)
									if ok {
										links = append(links, link)
									}
								}
							}
						}
					}
				}
			}
		}
	}
	if len(links) > 0 {
		log.Printf("Found instagram album with %d images (url: %s)\n", len(links), url)
	}
	return links
}
  241. //#endregion
  242. //#region Imgur
  243. func getImgurSingleUrls(url string) (map[string]string, error) {
  244. url = regexp.MustCompile(`(r\/[^\/]+\/)`).ReplaceAllString(url, "") // remove subreddit url
  245. url = strings.Replace(url, "imgur.com/", "imgur.com/download/", -1)
  246. url = strings.Replace(url, ".gifv", "", -1)
  247. return map[string]string{url: ""}, nil
  248. }
// imgurAlbumObject models the subset of the imgur album-images API
// response we use: the direct link of each image in the album.
type imgurAlbumObject struct {
	Data []struct {
		Link string // direct URL of the image file
	}
}
  254. func getImgurAlbumUrls(url string) (map[string]string, error) {
  255. url = regexp.MustCompile(`(#[A-Za-z0-9]+)?$`).ReplaceAllString(url, "") // remove anchor
  256. afterLastSlash := strings.LastIndex(url, "/")
  257. albumId := url[afterLastSlash+1:]
  258. headers := make(map[string]string)
  259. headers["Authorization"] = "Client-ID " + imgurClientID
  260. imgurAlbumObject := new(imgurAlbumObject)
  261. getJSONwithHeaders("https://api.imgur.com/3/album/"+albumId+"/images", imgurAlbumObject, headers)
  262. links := make(map[string]string)
  263. for _, v := range imgurAlbumObject.Data {
  264. links[v.Link] = ""
  265. }
  266. if len(links) <= 0 {
  267. return getImgurSingleUrls(url)
  268. }
  269. log.Printf("Found imgur album with %d images (url: %s)\n", len(links), url)
  270. return links, nil
  271. }
  272. //#endregion
  273. //#region Streamable
// streamableObject models the streamable.com video-info API response.
type streamableObject struct {
	Status int    `json:"status"` // processing state; the parser requires 2
	Title  string `json:"title"`
	Files  struct {
		Mp4 struct {
			URL    string `json:"url"` // may be protocol-relative ("//…")
			Width  int    `json:"width"`
			Height int    `json:"height"`
		} `json:"mp4"`
		Mp4Mobile struct {
			URL    string `json:"url"`
			Width  int    `json:"width"`
			Height int    `json:"height"`
		} `json:"mp4-mobile"`
	} `json:"files"`
	URL          string      `json:"url"`
	ThumbnailURL string      `json:"thumbnail_url"`
	Message      interface{} `json:"message"`
}
  293. func getStreamableUrls(url string) (map[string]string, error) {
  294. matches := regexUrlStreamable.FindStringSubmatch(url)
  295. shortcode := matches[3]
  296. if shortcode == "" {
  297. return nil, errors.New("unable to get shortcode from URL")
  298. }
  299. reqUrl := fmt.Sprintf("https://api.streamable.com/videos/%s", shortcode)
  300. streamable := new(streamableObject)
  301. getJSON(reqUrl, streamable)
  302. if streamable.Status != 2 || streamable.Files.Mp4.URL == "" {
  303. return nil, errors.New("streamable object has no download candidate")
  304. }
  305. link := streamable.Files.Mp4.URL
  306. if !strings.HasPrefix(link, "http") {
  307. link = "https:" + link
  308. }
  309. links := make(map[string]string)
  310. links[link] = ""
  311. return links, nil
  312. }
  313. //#endregion
  314. //#region Gfycat
// gfycatObject models the subset of the gfycat API response we use.
type gfycatObject struct {
	GfyItem struct {
		Mp4URL string `json:"mp4Url"` // direct MP4 download link
	} `json:"gfyItem"`
}
  320. func getGfycatUrls(url string) (map[string]string, error) {
  321. parts := strings.Split(url, "/")
  322. if len(parts) < 3 {
  323. return nil, errors.New("unable to parse Gfycat URL")
  324. }
  325. gfycatId := parts[len(parts)-1]
  326. gfycatObject := new(gfycatObject)
  327. getJSON("https://api.gfycat.com/v1/gfycats/"+gfycatId, gfycatObject)
  328. gfycatUrl := gfycatObject.GfyItem.Mp4URL
  329. if url == "" {
  330. return nil, errors.New("failed to read response from Gfycat")
  331. }
  332. return map[string]string{gfycatUrl: ""}, nil
  333. }
  334. //#endregion
  335. //#region Flickr
// flickrPhotoSizeObject is one entry of the Flickr getSizes response:
// a single available rendition of a photo.
type flickrPhotoSizeObject struct {
	Label  string `json:"label"` // human-readable size label
	Width  int    `json:"width"`
	Height int    `json:"height"`
	Source string `json:"source"` // direct image URL for this rendition
	URL    string `json:"url"`
	Media  string `json:"media"`
}
// flickrPhotoObject models the Flickr "flickr.photos.getSizes" API response.
type flickrPhotoObject struct {
	Sizes struct {
		Canblog     int                     `json:"canblog"`
		Canprint    int                     `json:"canprint"`
		Candownload int                     `json:"candownload"`
		Size        []flickrPhotoSizeObject `json:"size"` // all available renditions
	} `json:"sizes"`
	Stat string `json:"stat"`
}
  353. func getFlickrUrlFromPhotoId(photoId string) string {
  354. reqUrl := fmt.Sprintf("https://www.flickr.com/services/rest/?format=json&nojsoncallback=1&method=%s&api_key=%s&photo_id=%s",
  355. "flickr.photos.getSizes", config.Credentials.FlickrApiKey, photoId)
  356. flickrPhoto := new(flickrPhotoObject)
  357. getJSON(reqUrl, flickrPhoto)
  358. var bestSize flickrPhotoSizeObject
  359. for _, size := range flickrPhoto.Sizes.Size {
  360. if bestSize.Label == "" {
  361. bestSize = size
  362. } else {
  363. if size.Width > bestSize.Width || size.Height > bestSize.Height {
  364. bestSize = size
  365. }
  366. }
  367. }
  368. return bestSize.Source
  369. }
  370. func getFlickrPhotoUrls(url string) (map[string]string, error) {
  371. if config.Credentials.FlickrApiKey == "" {
  372. return nil, errors.New("invalid Flickr API Key Set")
  373. }
  374. matches := regexUrlFlickrPhoto.FindStringSubmatch(url)
  375. photoId := matches[5]
  376. if photoId == "" {
  377. return nil, errors.New("unable to get Photo ID from URL")
  378. }
  379. return map[string]string{getFlickrUrlFromPhotoId(photoId): ""}, nil
  380. }
// flickrAlbumObject models the Flickr "flickr.photosets.getPhotos" API
// response: one page of photo entries in a photoset.
type flickrAlbumObject struct {
	Photoset struct {
		ID        string `json:"id"`
		Primary   string `json:"primary"`
		Owner     string `json:"owner"`
		Ownername string `json:"ownername"`
		Photo     []struct {
			ID        string `json:"id"` // used to look up individual photo sizes
			Secret    string `json:"secret"`
			Server    string `json:"server"`
			Farm      int    `json:"farm"`
			Title     string `json:"title"`
			Isprimary string `json:"isprimary"`
			Ispublic  int    `json:"ispublic"`
			Isfriend  int    `json:"isfriend"`
			Isfamily  int    `json:"isfamily"`
		} `json:"photo"`
		Page    int    `json:"page"`
		PerPage int    `json:"per_page"`
		Perpage int    `json:"perpage"`
		Pages   int    `json:"pages"`
		Total   string `json:"total"`
		Title   string `json:"title"`
	} `json:"photoset"`
	Stat string `json:"stat"`
}
  407. func getFlickrAlbumUrls(url string) (map[string]string, error) {
  408. if config.Credentials.FlickrApiKey == "" {
  409. return nil, errors.New("invalid Flickr API Key Set")
  410. }
  411. matches := regexUrlFlickrAlbum.FindStringSubmatch(url)
  412. if len(matches) < 10 || matches[9] == "" {
  413. return nil, errors.New("unable to find Flickr Album ID in URL")
  414. }
  415. albumId := matches[9]
  416. if albumId == "" {
  417. return nil, errors.New("unable to get Album ID from URL")
  418. }
  419. reqUrl := fmt.Sprintf("https://www.flickr.com/services/rest/?format=json&nojsoncallback=1&method=%s&api_key=%s&photoset_id=%s&per_page=500",
  420. "flickr.photosets.getPhotos", config.Credentials.FlickrApiKey, albumId)
  421. flickrAlbum := new(flickrAlbumObject)
  422. getJSON(reqUrl, flickrAlbum)
  423. links := make(map[string]string)
  424. for _, photo := range flickrAlbum.Photoset.Photo {
  425. links[getFlickrUrlFromPhotoId(photo.ID)] = ""
  426. }
  427. return links, nil
  428. }
  429. func getFlickrAlbumShortUrls(url string) (map[string]string, error) {
  430. result, err := http.Get(url)
  431. if err != nil {
  432. return nil, errors.New("Error getting long URL from shortened Flickr Album URL: " + err.Error())
  433. }
  434. if regexUrlFlickrAlbum.MatchString(result.Request.URL.String()) {
  435. return getFlickrAlbumUrls(result.Request.URL.String())
  436. }
  437. return nil, errors.New("encountered invalid URL while trying to get long URL from short Flickr Album URL")
  438. }
  439. //#endregion
  440. //#region Tistory
  441. // getTistoryUrls downloads tistory URLs
  442. // http://t1.daumcdn.net/cfile/tistory/[…] => http://t1.daumcdn.net/cfile/tistory/[…]
  443. // http://t1.daumcdn.net/cfile/tistory/[…]?original => as is
  444. func getTistoryUrls(link string) (map[string]string, error) {
  445. if !strings.HasSuffix(link, "?original") {
  446. link += "?original"
  447. }
  448. return map[string]string{link: ""}, nil
  449. }
  450. func getLegacyTistoryUrls(link string) (map[string]string, error) {
  451. link = strings.Replace(link, "/image/", "/original/", -1)
  452. return map[string]string{link: ""}, nil
  453. }
  454. func getTistoryWithCDNUrls(urlI string) (map[string]string, error) {
  455. parameters, _ := url.ParseQuery(urlI)
  456. if val, ok := parameters["fname"]; ok {
  457. if len(val) > 0 {
  458. if regexUrlTistoryLegacy.MatchString(val[0]) {
  459. return getLegacyTistoryUrls(val[0])
  460. }
  461. }
  462. }
  463. return nil, nil
  464. }
  465. func getPossibleTistorySiteUrls(url string) (map[string]string, error) {
  466. client := new(http.Client)
  467. request, err := http.NewRequest("HEAD", url, nil)
  468. if err != nil {
  469. return nil, err
  470. }
  471. request.Header.Add("Accept-Encoding", "identity")
  472. request.Header.Add("User-Agent", sneakyUserAgent)
  473. respHead, err := client.Do(request)
  474. if err != nil {
  475. return nil, err
  476. }
  477. contentType := ""
  478. for headerKey, headerValue := range respHead.Header {
  479. if headerKey == "Content-Type" {
  480. contentType = headerValue[0]
  481. }
  482. }
  483. if !strings.Contains(contentType, "text/html") {
  484. return nil, nil
  485. }
  486. request, err = http.NewRequest("GET", url, nil)
  487. if err != nil {
  488. return nil, err
  489. }
  490. request.Header.Add("Accept-Encoding", "identity")
  491. request.Header.Add("User-Agent", sneakyUserAgent)
  492. resp, err := client.Do(request)
  493. if err != nil {
  494. return nil, err
  495. }
  496. doc, err := goquery.NewDocumentFromResponse(resp)
  497. if err != nil {
  498. return nil, err
  499. }
  500. var links = make(map[string]string)
  501. doc.Find(".article img, #content img, div[role=main] img, .section_blogview img").Each(func(i int, s *goquery.Selection) {
  502. foundUrl, exists := s.Attr("src")
  503. if exists {
  504. if regexUrlTistoryLegacyWithCDN.MatchString(foundUrl) {
  505. finalTistoryUrls, _ := getTistoryWithCDNUrls(foundUrl)
  506. if len(finalTistoryUrls) > 0 {
  507. for finalTistoryUrl := range finalTistoryUrls {
  508. foundFilename := s.AttrOr("filename", "")
  509. links[finalTistoryUrl] = foundFilename
  510. }
  511. }
  512. } else if regexUrlTistoryLegacy.MatchString(foundUrl) {
  513. finalTistoryUrls, _ := getLegacyTistoryUrls(foundUrl)
  514. if len(finalTistoryUrls) > 0 {
  515. for finalTistoryUrl := range finalTistoryUrls {
  516. foundFilename := s.AttrOr("filename", "")
  517. links[finalTistoryUrl] = foundFilename
  518. }
  519. }
  520. }
  521. }
  522. })
  523. if len(links) > 0 {
  524. log.Printf("[%s] Found tistory album with %d images (url: %s)\n", time.Now().Format(time.Stamp), len(links), url)
  525. }
  526. return links, nil
  527. }
  528. //#endregion
  529. //#region Reddit
  530. // This is very crude but works for now
  531. type redditThreadObject []struct {
  532. Kind string `json:"kind"`
  533. Data struct {
  534. Children interface{} `json:"children"`
  535. } `json:"data"`
  536. }
  537. func getRedditPostUrls(link string) (map[string]string, error) {
  538. redditThread := new(redditThreadObject)
  539. headers := make(map[string]string)
  540. headers["Accept-Encoding"] = "identity"
  541. headers["User-Agent"] = sneakyUserAgent
  542. err := getJSONwithHeaders(link+".json", redditThread, headers)
  543. if err != nil {
  544. return nil, fmt.Errorf("failed to parse json from reddit post:\t%s", err)
  545. }
  546. redditPost := (*redditThread)[0].Data.Children.([]interface{})[0].(map[string]interface{})
  547. redditPostData := redditPost["data"].(map[string]interface{})
  548. if redditPostData["url_overridden_by_dest"] != nil {
  549. redditLink := redditPostData["url_overridden_by_dest"].(string)
  550. filename := fmt.Sprintf("Reddit-%s_%s %s", redditPostData["subreddit"].(string), redditPostData["id"].(string), filenameFromURL(redditLink))
  551. return map[string]string{redditLink: filename}, nil
  552. }
  553. return nil, nil
  554. }
  555. //#endregion