You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

113 lines
4.0 KiB

4 years ago
  1. # USAGE
  2. # python search_bing_api.py --query "alan grant" --output dataset/alan_grant
  3. # python search_bing_api.py --query "ian malcolm" --output dataset/ian_malcolm
  4. # python search_bing_api.py --query "ellie sattler" --output dataset/ellie_sattler
  5. # python search_bing_api.py --query "john hammond jurassic park" --output dataset/john_hammond
  6. # python search_bing_api.py --query "owen grady jurassic world" --output dataset/owen_grady
  7. # python search_bing_api.py --query "claire dearing jurassic world" --output dataset/claire_dearing
# import the necessary packages
import argparse
import os
from urllib.parse import urlparse

import cv2
import requests
from requests import exceptions
  14. # construct the argument parser and parse the arguments
  15. ap = argparse.ArgumentParser()
  16. ap.add_argument("-q", "--query", required=True,
  17. help="search query to search Bing Image API for")
  18. ap.add_argument("-o", "--output", required=True,
  19. help="path to output directory of images")
  20. args = vars(ap.parse_args())
  21. # set your Microsoft Cognitive Services API key along with (1) the
  22. # maximum number of results for a given search and (2) the group size
  23. # for results (maximum of 50 per request)
  24. API_KEY = "INSERT_YOUR_API_KEY_HERE"
  25. MAX_RESULTS = 100
  26. GROUP_SIZE = 50
  27. # set the endpoint API URL
  28. URL = "https://api.cognitive.microsoft.com/bing/v7.0/images/search"
  29. # when attemping to download images from the web both the Python
  30. # programming language and the requests library have a number of
  31. # exceptions that can be thrown so let's build a list of them now
  32. # so we can filter on them
  33. EXCEPTIONS = set([IOError, FileNotFoundError,
  34. exceptions.RequestException, exceptions.HTTPError,
  35. exceptions.ConnectionError, exceptions.Timeout])
  36. # store the search term in a convenience variable then set the
  37. # headers and search parameters
  38. term = args["query"]
  39. headers = {"Ocp-Apim-Subscription-Key" : API_KEY}
  40. params = {"q": term, "offset": 0, "count": GROUP_SIZE}
  41. # make the search
  42. print("[INFO] searching Bing API for '{}'".format(term))
  43. search = requests.get(URL, headers=headers, params=params)
  44. search.raise_for_status()
  45. # grab the results from the search, including the total number of
  46. # estimated results returned by the Bing API
  47. results = search.json()
  48. estNumResults = min(results["totalEstimatedMatches"], MAX_RESULTS)
  49. print("[INFO] {} total results for '{}'".format(estNumResults,
  50. term))
  51. # initialize the total number of images downloaded thus far
  52. total = 0
  53. # loop over the estimated number of results in `GROUP_SIZE` groups
  54. for offset in range(0, estNumResults, GROUP_SIZE):
  55. # update the search parameters using the current offset, then
  56. # make the request to fetch the results
  57. print("[INFO] making request for group {}-{} of {}...".format(
  58. offset, offset + GROUP_SIZE, estNumResults))
  59. params["offset"] = offset
  60. search = requests.get(URL, headers=headers, params=params)
  61. search.raise_for_status()
  62. results = search.json()
  63. print("[INFO] saving images for group {}-{} of {}...".format(
  64. offset, offset + GROUP_SIZE, estNumResults))
  65. # loop over the results
  66. for v in results["value"]:
  67. # try to download the image
  68. try:
  69. # make a request to download the image
  70. print("[INFO] fetching: {}".format(v["contentUrl"]))
  71. r = requests.get(v["contentUrl"], timeout=30)
  72. # build the path to the output image
  73. ext = v["contentUrl"][v["contentUrl"].rfind("."):]
  74. p = os.path.sep.join([args["output"], "{}{}".format(
  75. str(total).zfill(8), ext)])
  76. # write the image to disk
  77. f = open(p, "wb")
  78. f.write(r.content)
  79. f.close()
  80. # catch any errors that would not unable us to download the
  81. # image
  82. except Exception as e:
  83. # check to see if our exception is in our list of
  84. # exceptions to check for
  85. if type(e) in EXCEPTIONS:
  86. print("[INFO] skipping: {}".format(v["contentUrl"]))
  87. continue
  88. # try to load the image from disk
  89. image = cv2.imread(p)
  90. # if the image is `None` then we could not properly load the
  91. # image from disk (so it should be ignored)
  92. if image is None:
  93. print("[INFO] deleting: {}".format(p))
  94. os.remove(p)
  95. continue
  96. # update the counter
  97. total += 1