Spaces:

jsulz
/

spaces-ship

Build error

App Files Files Community

jsulz committed on Sep 11, 2024

Commit

87f778f

1 Parent(s): c1b5e3a

rough work done; now to trim back, analyze, and then publish

Browse files

Files changed (3) hide show

app.py +261 -143
poetry.lock +31 -1
pyproject.toml +1 -0

app.py CHANGED Viewed

@@ -1,53 +1,17 @@
 import gradio as gr
 import pandas as pd
 import numpy as np
-# Load the spaces.parquet file as a dataframe
-df = pd.read_parquet("spaces.parquet")
 """
 Todos:
-    Create tabbed interface for filtering and graphs
-    plotly graph showing the growth of spaces over time
-    plotly graph showing the breakdown of spaces by sdk
-    plotly graph of colors
-    plotly graph of emojis
-    Plotly graph of hardware
-    Investigate README lengths
-    bar chart of the number of spaces per author
-    Is there a correlation between pinning a space and the number of likes?
-    Is a correlation between the emoji and the number of likes?
-    distribution of python versions
-    what models are most used
-    what organizations are most popular in terms of their models and datasets being used
-    most duplicated spaces
-        "id",
-        "author",
-        "created_at",
-        "last_modified",
-        "subdomain",
-        "host",
-        "likes",
-        "sdk",
-        "tags",
-        "readme_size",
-        "python_version",
-        "license",
-        "duplicated_from",
-        "models",
-        "datasets",
-        "emoji",
-        "colorFrom",
-        "colorTo",
-        "pinned",
-        "stage",
-        "hardware",
-        "devMode",
-        "custom_domains",
 """
-def filtered_df(emoji, likes, author, hardware, tags, models, datasets):
     _df = df
     # if emoji is not none, filter the dataframe with it
     if emoji:
@@ -80,118 +44,272 @@ def filtered_df(emoji, likes, author, hardware, tags, models, datasets):
                 )
             )
         ]
-    return _df
-with gr.Blocks() as demo:
-    df = df[df["stage"] == "RUNNING"]
-    # combine the sdk and tags columns, one of which is a string and the other is an array of strings
-    # first convert the sdk column to an array of strings
-    df["sdk"] = df["sdk"].apply(lambda x: np.array([x]))
-    # then combine the sdk and tags columns so that their elements are together
-    df["sdk_tags"] = df[["sdk", "tags"]].apply(
-        lambda x: np.concatenate((x[0], x[1])), axis=1
     )
-    # where the custom_domains column is not null, use that as the url, otherwise, use the host column
-    df["url"] = np.where(
-        df["custom_domains"].isnull(),
-        df["id"],
-        df["custom_domains"],
-    )
-    emoji = gr.Dropdown(
-        df["emoji"].unique().tolist(), label="Search by Emoji 🤗", multiselect=True
-    )  # Dropdown to select the emoji
-    likes = gr.Slider(
-        minimum=df["likes"].min(),
-        maximum=df["likes"].max(),
-        step=1,
-        label="Filter by Likes",
-    )  # Slider to filter by likes
-    hardware = gr.Dropdown(
-        df["hardware"].unique().tolist(), label="Search by Hardware", multiselect=True
-    )
-    author = gr.Dropdown(
-        df["author"].unique().tolist(), label="Search by Author", multiselect=True
-    )
-    # get the list of unique strings in the sdk_tags column
-    sdk_tags = np.unique(np.concatenate(df["sdk_tags"].values))
-    # create a dropdown for the sdk_tags
-    sdk_tags = gr.Dropdown(
-        sdk_tags.tolist(), label="Filter by SDK/Tags", multiselect=True
-    )
-    # create a gradio checkbox group for hardware
-    hardware = gr.CheckboxGroup(
-        df["hardware"].unique().tolist(), label="Filter by Hardware"
-    )
-    space_license = gr.CheckboxGroup(
-        df["license"].unique().tolist(), label="Filter by license"
-    )
-    # Assuming df is your dataframe and 'array_column' is the column containing np.array of strings
-    array_column_as_lists = df["models"].apply(
-        lambda x: np.array(["None"]) if np.ndim(x) == 0 else x
-    )
-    # Now, flatten all arrays into one list
-    flattened_strings = np.concatenate(array_column_as_lists.values)
-    # Get unique strings
-    unique_strings = np.unique(flattened_strings)
-    # Convert to a list if needed
-    unique_strings_list = unique_strings.tolist()
-    models = gr.Dropdown(
-        unique_strings_list,
-        label="Search by Model",
-        multiselect=True,
-    )
-    # Assuming df is your dataframe and 'array_column' is the column containing np.array of strings
-    array_column_as_lists = df["datasets"].apply(
-        lambda x: np.array(["None"]) if np.ndim(x) == 0 else x
-    )
-    # Now, flatten all arrays into one list
-    flattened_strings = np.concatenate(array_column_as_lists.values)
-    # Get unique strings
-    unique_strings = np.unique(flattened_strings)
-    # Convert to a list if needed
-    unique_strings_list = unique_strings.tolist()
-    datasets = gr.Dropdown(
-        unique_strings_list,
-        label="Search by Model",
-        multiselect=True,
-    )
-    devMode = gr.Checkbox(value=False, label="DevMode Enabled")
-    clear = gr.ClearButton(components=[emoji])
-    df = pd.DataFrame(
-        df[
-            [
-                "id",
-                "emoji",
-                "author",
-                "url",
-                "likes",
-                "hardware",
-                "sdk_tags",
-                "models",
-                "datasets",
             ]
-        ]
-    )
-    df["url"] = df["url"].apply(
-        lambda x: (
-            f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
-            if x is not None and "/" in x
-            else f"<a target='_blank' href=https://{x[0]}>{x[0]}</a>"
         )
-    )
-    gr.DataFrame(
-        filtered_df,
-        inputs=[emoji, likes, author, hardware, sdk_tags, models, datasets],
-        datatype="html",
-    )
 demo.launch()

 import gradio as gr
 import pandas as pd
 import numpy as np
+import plotly.express as px
+# Load the spaces.parquet file as a dataframe and do some pre cleaning steps
 """
 Todos:
+    Clean up existing filtering code
 """
+def filtered_df(emoji, likes, author, hardware, tags, models, datasets, space_licenses):
     _df = df
     # if emoji is not none, filter the dataframe with it
     if emoji:
                 )
             )
         ]
+    if space_licenses:
+        _df = _df[
+            _df["licenses"].apply(
+                lambda x: (
+                    any(space_license in x for space_license in space_licenses)
+                    if x is not None
+                    else False
+                )
+            )
+        ]
+    # rename the columns names to make them more readable
+    _df = _df.rename(
+        columns={
+            'url': 'URL',
+            'likes': 'Likes',
+            "r_models": "Models",
+            "r_datasets": "Datasets",
+            "r_licenses": "Licenses",
+        }
     )
+    return _df[["URL", "Likes", "Models", "Datasets", "Licenses" ]]
+with gr.Blocks(fill_width=True) as demo:
+    with gr.Tab(label="Spaces Overview"):
+        # The Pandas dataframe has a datetime column. Plot the growth of spaces (row entries) over time.
+        # The x-axis should be the date and the y-axis should be the cumulative number of spaces created up to that date.
+        df = pd.read_parquet("spaces.parquet")
+        df = df.sort_values("created_at")
+        df['cumulative_spaces'] = df['created_at'].rank(method='first').astype(int)
+        fig1 = px.line(df, x='created_at', y='cumulative_spaces', title='Growth of Spaces Over Time', labels={'created_at': 'Date', 'cumulative_spaces': 'Number of Spaces'}, template='plotly_dark')
+        gr.Plot(fig1)
+        # Create a pie chart showing the distribution of spaces by SDK
+        fig2 = px.pie(df, names='sdk', title='Distribution of Spaces by SDK', template='plotly_dark')
+        gr.Plot(fig2)
+        # create a pie chart showing the distribution of spaces by emoji for the top 10 used emojis
+        emoji_counts = df['emoji'].value_counts().head(10).reset_index()
+        fig3 = px.pie(emoji_counts, names='emoji', values='count', title='Distribution of Spaces by Emoji', template='plotly_dark')
+        gr.Plot(fig3)
+        # Create a dataframe with the top 20 authors and the number of spaces they have created
+        author_counts = df['author'].value_counts().head(20).reset_index()
+        author_counts.columns = ['Author', 'Number of Spaces']
+        gr.DataFrame(author_counts)
+        # Create a scatter plot showing the relationship between the number of likes and the number of spaces created by an author
+        author_likes = df.groupby('author').agg({'likes': 'sum', 'id': 'count'}).reset_index()
+        fig4 = px.scatter(author_likes, x='id', y='likes', title='Relationship between Number of Spaces Created and Number of Likes', labels={'id': 'Number of Spaces Created', 'likes': 'Number of Likes'}, hover_data={'author': True}, template='plotly_dark')
+        gr.Plot(fig4)
+        # Create a scatter plot showing the relationship between the total number of likes and the number of spaces for the 20 emojis with the most likes
+        emoji_likes = df.groupby('emoji').agg({'likes': 'sum', 'id': 'count'}).sort_values(by='likes', ascending=False).head(20).reset_index()
+        fig10 = px.scatter(emoji_likes, x='id', y='likes', title='Relationship between Number of Spaces Created and Number of Likes', labels={'id': 'Number of Spaces Created', 'likes': 'Number of Likes'}, hover_data={'emoji': True}, template='plotly_dark')
+        gr.Plot(fig10)
+        # Create a bar chart of hardware in use
+        hardware = df['hardware'].value_counts().reset_index()
+        hardware.columns = ['Hardware', 'Number of Spaces']
+        fig5 = px.bar(hardware, x='Hardware', y='Number of Spaces', title='Hardware in Use', labels={'Hardware': 'Hardware', 'Number of Spaces': 'Number of Spaces (log scale)'}, color='Hardware', template='plotly_dark')
+        fig5.update_layout(yaxis_type='log')
+        gr.Plot(fig5)
+        models = np.concatenate([arr for arr in df['models'].values if arr is not None])
+        model_count = {}
+        model_author_count = {}
+        for model in models:
+            author = model.split('/')[0]
+            if model in model_count:
+                model_count[model] += 1
+            else:
+                model_count[model] = 1
+            if author in model_author_count:
+                model_author_count[author] += 1
+            else:
+                model_author_count[author] = 1
+        model_author_count = pd.DataFrame(model_author_count.items(), columns=['Model Author', 'Number of Spaces'])
+        fig8 = px.bar(model_author_count.sort_values('Number of Spaces', ascending=False).head(20), x='Model Author', y='Number of Spaces', title='Most Popular Model Authors', labels={'Model': 'Model', 'Number of Spaces': 'Number of Spaces'}, template='plotly_dark')
+        gr.Plot(fig8)
+        model_count = pd.DataFrame(model_count.items(), columns=['Model', 'Number of Spaces'])
+        # then make a bar chart
+        fig6 = px.bar(model_count.sort_values('Number of Spaces', ascending=False).head(20), x='Model', y='Number of Spaces', title='Most Used Models', labels={'Model': 'Model', 'Number of Spaces': 'Number of Spaces'}, template='plotly_dark')
+        gr.Plot(fig6)
+        datasets = np.concatenate([arr for arr in df['datasets'].values if arr is not None])
+        dataset_count = {}
+        dataset_author_count = {}
+        for dataset in datasets:
+            author = dataset.split('/')[0]
+            if dataset in dataset_count:
+                dataset_count[dataset] += 1
+            else:
+                dataset_count[dataset] = 1
+            if author in dataset_author_count:
+                dataset_author_count[author] += 1
+            else:
+                dataset_author_count[author] = 1
+        dataset_count = pd.DataFrame(dataset_count.items(), columns=['Datasets', 'Number of Spaces'])
+        dataset_author_count = pd.DataFrame(dataset_author_count.items(), columns=['Dataset Author', 'Number of Spaces'])
+        fig9 = px.bar(dataset_author_count.sort_values('Number of Spaces', ascending=False).head(20), x='Dataset Author', y='Number of Spaces', title='Most Popular Dataset Authors', labels={'Dataset Author': 'Dataset Author', 'Number of Spaces': 'Number of Spaces'}, template='plotly_dark')
+        gr.Plot(fig9)
+        # then make a bar chart
+        fig7 = px.bar(dataset_count.sort_values('Number of Spaces', ascending=False).head(20), x='Datasets', y='Number of Spaces', title='Most Used Datasets', labels={'Datasets': 'Datasets', 'Number of Spaces': 'Number of Spaces'}, template='plotly_dark')
+        gr.Plot(fig7)
+        # Get the most duplicated spaces
+        duplicated_spaces = df['duplicated_from'].value_counts().head(20).reset_index()
+        duplicated_spaces.columns = ['Space', 'Number of Duplicates']
+        gr.DataFrame(duplicated_spaces)
+        # Get the most liked spaces
+        liked_spaces = df[['id', 'likes']].sort_values(by='likes', ascending=False).head(20)
+        liked_spaces.columns = ['Space', 'Number of Likes']
+        gr.DataFrame(liked_spaces)
+        # Get the spaces with the longest READMEs
+        readme_sizes = df[['id', 'readme_size']].sort_values(by='readme_size', ascending=False).head(20)
+        readme_sizes.columns = ['Space', 'Longest READMEs']
+        gr.DataFrame(readme_sizes)
+    with gr.Tab(label="Spaces Search"):
+        df = pd.read_parquet("spaces.parquet")
+        df = df[df["stage"] == "RUNNING"]
+        # combine the sdk and tags columns, one of which is a string and the other is an array of strings
+        # first convert the sdk column to an array of strings
+        df["sdk"] = df["sdk"].apply(lambda x: np.array([str(x)]))
+        df["licenses"] = df["license"].apply(
+            lambda x: np.array([str(x)]) if x is None else x
+        )
+        # then combine the sdk and tags columns so that their elements are together
+        df["sdk_tags"] = df[["sdk", "tags"]].apply(
+            lambda x: np.concatenate((x.iloc[0], x.iloc[1])), axis=1
+        )
+        df['emoji'] = np.where(df['emoji'].isnull(), '', df['emoji'])
+        # where the custom_domains column is not null, use that as the url, otherwise, use the id column
+        df["url"] = np.where(
+            df["custom_domains"].isnull(),
+            df["id"],
+            df["custom_domains"],
+        )
+        df["url"] = df[["url", "emoji"]].apply(
+            lambda x: (
+                f"<a target='_blank' href=https://huggingface.co/spaces/{x.iloc[0]}>{str(x.iloc[1]) + " " + x.iloc[0]}</a>"
+                if x.iloc[0] is not None and "/" in x.iloc[0]
+                else f"<a target='_blank' href=https://{x.iloc[0][0]}>{str(x.iloc[1]) + " " + x.iloc[0][0]}</a>"
+            ),
+            axis=1,
+        )
+        # Make all of this human readable
+        df["r_models"] = [', '.join(models) if models is not None else '' for models in df["models"]]
+        df["r_sdk_tags"] = [', '.join(sdk_tags) if sdk_tags is not None else '' for sdk_tags in df["sdk_tags"]]
+        df["r_datasets"] = [', '.join(datasets) if datasets is not None else '' for datasets in df["datasets"]]
+        df["r_licenses"] = [', '.join(licenses) if licenses is not None else '' for licenses in df["licenses"]]
+        emoji = gr.Dropdown(
+            df["emoji"].unique().tolist(), label="Search by Emoji 🤗", multiselect=True
+        )  # Dropdown to select the emoji
+        likes = gr.Slider(
+            minimum=df["likes"].min(),
+            maximum=df["likes"].max(),
+            step=1,
+            label="Filter by Likes",
+        )  # Slider to filter by likes
+        hardware = gr.Dropdown(
+            df["hardware"].unique().tolist(), label="Search by Hardware", multiselect=True
+        )
+        author = gr.Dropdown(
+            df["author"].unique().tolist(), label="Search by Author", multiselect=True
+        )
+        # get the list of unique strings in the sdk_tags column
+        sdk_tags = np.unique(np.concatenate(df["sdk_tags"].values))
+        # create a dropdown for the sdk_tags
+        sdk_tags = gr.Dropdown(
+            sdk_tags.tolist(), label="Filter by SDK/Tags", multiselect=True
+        )
+        # create a gradio checkbox group for hardware
+        hardware = gr.CheckboxGroup(
+            df["hardware"].unique().tolist(), label="Filter by Hardware"
+        )
+        licenses = np.unique(np.concatenate(df["licenses"].values))
+        space_license = gr.CheckboxGroup(licenses.tolist(), label="Filter by license")
+        # If the models column is none make it an array of "none" so that things don't break
+        models_column_to_list = df["models"].apply(
+            lambda x: np.array(["None"]) if np.ndim(x) == 0 else x
+        )
+        # Now, flatten all arrays into one list
+        models_flattened = np.concatenate(models_column_to_list.values)
+        # Get unique strings
+        unique_models = np.unique(models_flattened)
+        models = gr.Dropdown(
+            unique_models.tolist(),
+            label="Search by Model",
+            multiselect=True,
+        )
+        # Do the same for datasets that we did for models
+        datasets_column_to_list = df["datasets"].apply(
+            lambda x: np.array(["None"]) if np.ndim(x) == 0 else x
+        )
+        flattened_datasets = np.concatenate(datasets_column_to_list.values)
+        unique_datasets = np.unique(flattened_datasets)
+        datasets = gr.Dropdown(
+            unique_datasets.tolist(),
+            label="Search by Dataset",
+            multiselect=True,
+        )
+        devMode = gr.Checkbox(value=False, label="DevMode Enabled")
+        clear = gr.ClearButton(components=[
+                emoji,
+                author,
+                hardware,
+                sdk_tags,
+                models,
+                datasets,
+                space_license
+                ])
+        df = pd.DataFrame(
+            df[
+                [
+                    "id",
+                    "emoji",
+                    "author",
+                    "url",
+                    "likes",
+                    "hardware",
+                    "sdk_tags",
+                    "models",
+                    "datasets",
+                    "licenses",
+                    "r_sdk_tags",
+                    "r_models",
+                    "r_datasets",
+                    "r_licenses",
+                ]
             ]
         )
+        gr.DataFrame(
+            filtered_df,
+            inputs=[
+                emoji,
+                likes,
+                author,
+                hardware,
+                sdk_tags,
+                models,
+                datasets,
+                space_license,
+            ],
+            datatype="html",
+            wrap=True,
+            column_widths=["25%", "5%", "25%", "25%", "20%"]
+        )
 demo.launch()

poetry.lock CHANGED Viewed

@@ -1648,6 +1648,21 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa
 typing = ["typing-extensions"]
 xmp = ["defusedxml"]
 [[package]]
 name = "pyarrow"
 version = "17.0.0"
@@ -2093,6 +2108,21 @@ anyio = ">=3.4.0,<5"
 [package.extras]
 full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7)", "pyyaml"]
 [[package]]
 name = "tomlkit"
 version = "0.12.0"
@@ -2519,4 +2549,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "63092c19114798c87a85b63a2e684aa5497c8e081fe5121e91c46e442a6f6e6e"

 typing = ["typing-extensions"]
 xmp = ["defusedxml"]
+[[package]]
+name = "plotly"
+version = "5.24.0"
+description = "An open-source, interactive data visualization library for Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "plotly-5.24.0-py3-none-any.whl", hash = "sha256:0e54efe52c8cef899f7daa41be9ed97dfb6be622613a2a8f56a86a0634b2b67e"},
+    {file = "plotly-5.24.0.tar.gz", hash = "sha256:eae9f4f54448682442c92c1e97148e3ad0c52f0cf86306e1b76daba24add554a"},
+]
+[package.dependencies]
+packaging = "*"
+tenacity = ">=6.2.0"
 [[package]]
 name = "pyarrow"
 version = "17.0.0"
 [package.extras]
 full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7)", "pyyaml"]
+[[package]]
+name = "tenacity"
+version = "9.0.0"
+description = "Retry code until it succeeds"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "tenacity-9.0.0-py3-none-any.whl", hash = "sha256:93de0c98785b27fcf659856aa9f54bfbd399e29969b0621bc7f762bd441b4539"},
+    {file = "tenacity-9.0.0.tar.gz", hash = "sha256:807f37ca97d62aa361264d497b0e31e92b8027044942bfa756160d908320d73b"},
+]
+[package.extras]
+doc = ["reno", "sphinx"]
+test = ["pytest", "tornado (>=4.5)", "typeguard"]
 [[package]]
 name = "tomlkit"
 version = "0.12.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
+content-hash = "462f1993751686e196fc4b665537755237673c202245650318bcdcfbd89485ea"

pyproject.toml CHANGED Viewed

@@ -11,6 +11,7 @@ python = "^3.12"
 gradio = "^4.42.0"
 datasets = "^2.21.0"
 pandas = "^2.2.2"
 [build-system]

 gradio = "^4.42.0"
 datasets = "^2.21.0"
 pandas = "^2.2.2"
+plotly = "^5.24.0"
 [build-system]